## Model Benchmarks

In [1]:
#import libraries
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import requests
import time
import random
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from bs4 import BeautifulSoup             
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
# English stop-word set built from the NLTK stopwords corpus.
# NOTE(review): no later cell visible in this notebook references stop_words — confirm it is still needed.
stop_words = set(stopwords.words('english'))

In [2]:
# Load the combined training data; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/train_combine.csv')

In [3]:
# Count missing values per column before modeling.
df.isna().sum()

subreddit        0
combine_text     0
clean_combine    6
is_coffee        0
dtype: int64

In [4]:
# Drop the rows that contain missing values (rebinding df rather than mutating it in place).
df = df.dropna()

In [5]:
df

Unnamed: 0,subreddit,combine_text,clean_combine,is_coffee
0,Coffee,\n\nWelcome to the daily [/r/Coffee](https://...,welcome daily thread stupid ask answer start s...,1
1,Coffee,Welcome to the /r/Coffee deal and promotional ...,welcome deal promotional thread weekly thread ...,1
2,Coffee,Had a barista look at me like o was an idiot a...,barista look idiot ask flat white armed exact ...,1
3,Coffee,"Hello everyone, I’m new to this sub so please ...",hello everyone new sub please direct topic fit...,1
4,Coffee,I've been getting pre-ground coffee (Bustelo/L...,pre ground bustelo lavazza moka pot recently s...,1
...,...,...,...,...
1906,tea,Brand new to Tea! Has anyone tried Vahdam tea...,brand new anyone vahdam matcha,0
1907,tea,"A nice, relaxing day with a Dan Cong oolong a...",nice relaxing day dan cong oolong secret spot,0
1908,tea,Clouds over a cup when tea is the sun.,cloud sun,0
1909,tea,Benefits? of getting loose leaf at a bar.,benefit loose leaf bar,0


### Define X, y and check baseline

In [6]:
# Features: the cleaned post text. Target: is_coffee (1 = Coffee post, 0 = tea post).
X, y = df['clean_combine'], df['is_coffee']

In [7]:
# Class proportions: the majority-class share is the baseline accuracy any model has to beat.
y.value_counts(normalize = True)

0    0.514436
1    0.485564
Name: is_coffee, dtype: float64

### Train/test split

In [8]:
# Hold out 25% of the posts as a test set; stratify on y so both splits keep
# the same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

### Modeling

#### Use CountVectorizer for Logistic Regression and Naive Bayes

In [9]:
cvec = CountVectorizer()

In [10]:
# Learn the vocabulary from the training text only, then apply that same
# vocabulary to the test text.
train_data_features = cvec.fit_transform(X_train)
test_data_features = cvec.transform(X_test)

In [11]:
# Default logistic regression on the count features; fit() returns the fitted
# estimator, so it can be instantiated and trained in one expression.
lr = LogisticRegression().fit(train_data_features, y_train)

# Mean accuracy on the training data.
lr.score(train_data_features, y_train)

0.9943977591036415

In [12]:
lr.score(test_data_features, y_test)

0.89937106918239

In [13]:
nb = MultinomialNB()

In [14]:
# Train the Naive Bayes model on the count features and report its mean
# accuracy on the same training data (fit() returns the fitted estimator).
nb.fit(train_data_features, y_train).score(train_data_features, y_train)

0.9607843137254902

In [15]:
nb.score(test_data_features, y_test)

0.9203354297693921

#### Use TfidfVectorizer for Logistic Regression and Naive Bayes

In [16]:
tvec = TfidfVectorizer()

In [17]:
# Refit the feature matrices with TF-IDF weights. This overwrites the
# count-based train_data_features / test_data_features from the previous section.
train_data_features = tvec.fit_transform(X_train)
test_data_features = tvec.transform(X_test)

In [18]:
# Default logistic regression on the TF-IDF features; fit() returns the fitted
# estimator, so it can be instantiated and trained in one expression.
lr = LogisticRegression().fit(train_data_features, y_train)

# Mean accuracy on the training data.
lr.score(train_data_features, y_train)

0.9754901960784313

In [19]:
lr.score(test_data_features, y_test)

0.909853249475891

In [20]:
nb = MultinomialNB()

In [21]:
# Train the Naive Bayes model on the TF-IDF features and report its mean
# accuracy on the same training data (fit() returns the fitted estimator).
nb.fit(train_data_features, y_train).score(train_data_features, y_train)

0.9705882352941176

In [22]:
nb.score(test_data_features, y_test)

0.9224318658280922

#### Use Pipeline for CountVectorizer Logistic Regression

In [23]:
# CountVectorizer -> logistic regression; the step names ('cvec', 'lr') are the
# prefixes used by the grid-search parameter names.
pipe1 = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=200)),
])

In [24]:
pipe1.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()),
  ('lr', LogisticRegression(max_iter=200))],
 'verbose': False,
 'cvec': CountVectorizer(),
 'lr': LogisticRegression(max_iter=200),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 200,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': None,
 'lr__solver': 'lbfgs',
 'lr__tol': 0.0001,
 'lr__verbose': 0,
 'lr__

In [25]:
# Search space for the CountVectorizer + logistic regression pipeline.
pipe_params1 = {
    'cvec__max_features': [1000, 2000, 3000],  # cap on vocabulary size
    'cvec__min_df': [2, 3, 4],                 # minimum document frequency for a term
    'cvec__max_df': [0.9, 0.95],               # maximum document share for a term
    'cvec__ngram_range': [(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    # Inverse regularization strength. A wider grid, [100, 10, 1.0, 0.1],
    # is kept here for reference.
    'lr__C': [1.0, 0.1, 0.01],
}

In [26]:
# Exhaustive search over pipe_params1 for pipe1, scored with 5-fold cross-validation.
gs1 = GridSearchCV(estimator=pipe1, param_grid=pipe_params1, cv=5)

In [27]:
gs1.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=200))]),
             param_grid={'cvec__max_df': [0.9, 0.95],
                         'cvec__max_features': [1000, 2000, 3000],
                         'cvec__min_df': [2, 3, 4],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'lr__C': [1.0, 0.1, 0.01]})

In [28]:
gs1.best_score_

0.9082591093117409

In [29]:
gs1_model =gs1.best_estimator_

In [30]:
gs1_model.score(X_train, y_train)

0.992296918767507

In [31]:
gs1_model.score(X_test, y_test)

0.9077568134171907

In [32]:
gs1_model

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.9, max_features=3000, min_df=2,
                                 ngram_range=(1, 2))),
                ('lr', LogisticRegression(max_iter=200))])

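- The grid searched here is C ∈ {1.0, 0.1, 0.01}, and the best estimator keeps the default C = 1 (the repr above lists only non-default parameters, so C does not appear). A higher C means weaker regularization in logistic regression, so larger values such as C = 10 would be expected to overfit even more; even at C = 1 the train/test gap is large (99.2% vs. 90.8%).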

#### Use Pipeline for TfidfVectorizer Logistic Regression

In [33]:
# TfidfVectorizer -> logistic regression; the step names ('tvec', 'lr') are the
# prefixes used by the grid-search parameter names.
pipe2 = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=200)),
])

In [34]:
pipe2.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()),
  ('lr', LogisticRegression(max_iter=200))],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'lr': LogisticRegression(max_iter=200),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 200,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty'

In [35]:
# Search space for the TfidfVectorizer + logistic regression pipeline.
pipe_params2 = {
    'tvec__max_features': [1000, 2000, 3000],  # cap on vocabulary size
    'tvec__min_df': [2, 3, 4],                 # minimum document frequency for a term
    'tvec__max_df': [0.9, 0.95],               # maximum document share for a term
    'tvec__ngram_range': [(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    # Inverse regularization strength. A wider grid, [100, 10, 1.0, 0.1],
    # is kept here for reference.
    'lr__C': [1.0, 0.1, 0.01],
}

In [36]:
# Exhaustive search over pipe_params2 for pipe2, scored with 5-fold cross-validation.
gs2 = GridSearchCV(estimator=pipe2, param_grid=pipe_params2, cv=5)

In [37]:
gs2.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=200))]),
             param_grid={'lr__C': [1.0, 0.1, 0.01], 'tvec__max_df': [0.9, 0.95],
                         'tvec__max_features': [1000, 2000, 3000],
                         'tvec__min_df': [2, 3, 4],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [38]:
gs2.best_score_

0.9152643847380689

In [39]:
gs2_model=gs2.best_estimator_

In [40]:
gs2_model.score(X_train, y_train)

0.9747899159663865

In [41]:
gs2_model.score(X_test, y_test)

0.9203354297693921

In [42]:
gs2_model

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_df=0.9, max_features=2000, min_df=2)),
                ('lr', LogisticRegression(max_iter=200))])

#### Use Pipeline for CountVectorizer Naive Bayes

In [43]:
# CountVectorizer -> Multinomial Naive Bayes; the step names ('cvec', 'nb') are
# the prefixes used by the grid-search parameter names.
pipe3 = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [44]:
# Search space for the CountVectorizer + Multinomial Naive Bayes pipeline.
pipe_params3 = {
    'cvec__max_features': [1000, 2000, 3000],  # cap on vocabulary size
    'cvec__min_df': [2, 3, 4],                 # minimum document frequency for a term
    'cvec__max_df': [0.9, 0.95],               # maximum document share for a term
    'cvec__ngram_range': [(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    # Additive smoothing strength. Alternative grids kept for reference:
    # [0.1, 1, 10, 100] or np.linspace(0.1, 1.0, 20).
    'nb__alpha': [1, 10, 100],
}

In [45]:
# Variant of the Naive Bayes search space with a much smaller vocabulary
# (100-300 features) and a wider alpha range.
pipe_params3_1 = {
    'cvec__max_features': [100, 200, 300],     # cap on vocabulary size
    'cvec__min_df': [2, 3, 4],                 # minimum document frequency for a term
    'cvec__max_df': [0.9, 0.95],               # maximum document share for a term
    'cvec__ngram_range': [(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    'nb__alpha': [0.1, 1, 10, 100],            # additive smoothing strength
}

In [46]:
# Exhaustive search over pipe_params3 for pipe3, scored with 5-fold cross-validation.
gs3 = GridSearchCV(estimator=pipe3, param_grid=pipe_params3, cv=5)

In [47]:
# Same pipeline as gs3, searched over the reduced-vocabulary grid pipe_params3_1
# (the fit call for this search is currently commented out further down).
gs3_1 = GridSearchCV(estimator=pipe3, param_grid=pipe_params3_1, cv=5)

In [48]:
gs3.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'cvec__max_df': [0.9, 0.95],
                         'cvec__max_features': [1000, 2000, 3000],
                         'cvec__min_df': [2, 3, 4],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'nb__alpha': [1, 10, 100]})

In [49]:
gs3.best_score_

0.9019654030180344

In [50]:
gs3_model =gs3.best_estimator_

In [51]:
gs3_model.score(X_train, y_train)

0.9565826330532213

In [52]:
gs3_model.score(X_test, y_test)

0.9329140461215933

In [53]:
gs3_model

Pipeline(steps=[('cvec',
                 CountVectorizer(max_df=0.9, max_features=3000, min_df=2)),
                ('nb', MultinomialNB(alpha=1))])

In [54]:
#if i put nb__alpha in the grid search, best_params for it is 0.1 and we get 0.95 train score

#### Use Pipeline for CountVectorizer Naive Bayes to change max features

In [55]:
#gs3_1.fit(X_train,y_train)

In [56]:
#gs3_1_model =gs3_1.best_estimator_

In [57]:
#gs3_1_model.score(X_train, y_train)

In [58]:
#gs3_1_model.score(X_test, y_test)

In [59]:
#gs3_1_model

#### Use Pipeline for TfidfVectorizer Naive Bayes

In [60]:
# TfidfVectorizer -> Multinomial Naive Bayes; the step names ('tvec', 'nb') are
# the prefixes used by the grid-search parameter names.
pipe4 = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

In [61]:
pipe4.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('nb', MultinomialNB())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'nb': MultinomialNB(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': True}

In [62]:
# Search space for the TfidfVectorizer + Multinomial Naive Bayes pipeline.
pipe_params4 = {
    'tvec__max_features': [1000, 2000, 3000],  # cap on vocabulary size
    'tvec__min_df': [2, 3, 4],                 # minimum document frequency for a term
    'tvec__max_df': [0.9, 0.95],               # maximum document share for a term
    'tvec__ngram_range': [(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    # Additive smoothing strength. Alternative grid kept for reference:
    # [0.1, 1, 10, 100].
    'nb__alpha': [1, 10, 100],
}

In [63]:
# Exhaustive search over pipe_params4 for pipe4, scored with 5-fold cross-validation.
gs4 = GridSearchCV(estimator=pipe4, param_grid=pipe_params4, cv=5)

In [64]:
gs4.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             param_grid={'nb__alpha': [1, 10, 100], 'tvec__max_df': [0.9, 0.95],
                         'tvec__max_features': [1000, 2000, 3000],
                         'tvec__min_df': [2, 3, 4],
                         'tvec__ngram_range': [(1, 1), (1, 2)]})

In [65]:
gs4.best_score_

0.9019555882713778

In [66]:
gs4_model =gs4.best_estimator_

In [67]:
gs4_model.score(X_train, y_train)

0.9502801120448179

In [68]:
gs4_model.score(X_test, y_test)

0.9119496855345912

In [69]:
gs4_model

Pipeline(steps=[('tvec',
                 TfidfVectorizer(max_df=0.9, max_features=2000, min_df=3,
                                 ngram_range=(1, 2))),
                ('nb', MultinomialNB(alpha=1))])

#### Random Forest

In [70]:
# TfidfVectorizer -> random forest.
# random_state is fixed so that the forest's bootstrap samples and per-split
# feature draws (and therefore its scores) are identical on every run; 42
# matches the seed already used for train_test_split.
pipe5 = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42)),
])

In [71]:
pipe5.get_params()

{'memory': None,
 'steps': [('tvec', TfidfVectorizer()), ('rf', RandomForestClassifier())],
 'verbose': False,
 'tvec': TfidfVectorizer(),
 'rf': RandomForestClassifier(),
 'tvec__analyzer': 'word',
 'tvec__binary': False,
 'tvec__decode_error': 'strict',
 'tvec__dtype': numpy.float64,
 'tvec__encoding': 'utf-8',
 'tvec__input': 'content',
 'tvec__lowercase': True,
 'tvec__max_df': 1.0,
 'tvec__max_features': None,
 'tvec__min_df': 1,
 'tvec__ngram_range': (1, 1),
 'tvec__norm': 'l2',
 'tvec__preprocessor': None,
 'tvec__smooth_idf': True,
 'tvec__stop_words': None,
 'tvec__strip_accents': None,
 'tvec__sublinear_tf': False,
 'tvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tvec__tokenizer': None,
 'tvec__use_idf': True,
 'tvec__vocabulary': None,
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__class_weight': None,
 'rf__criterion': 'gini',
 'rf__max_depth': None,
 'rf__max_features': 'auto',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 '

In [72]:
# Search space for the TfidfVectorizer + random forest pipeline.
pipe_params5 = {
    'tvec__max_features': [1000, 2000, 3000],  # cap on vocabulary size
    'tvec__min_df': [2, 3, 4],                 # minimum document frequency for a term
    'tvec__ngram_range': [(1, 1), (1, 2)],     # unigrams only vs. unigrams + bigrams
    'rf__n_estimators': [100, 200, 300],       # number of trees
    'rf__max_depth': [None, 2, 4, 6, 8],       # None leaves tree depth unlimited
    'rf__max_features': ['sqrt', 'log2'],      # features considered at each split
}

In [73]:
# Exhaustive search over pipe_params5 for pipe5 with 5-fold cross-validation.
# The grid has 3 * 3 * 2 * 3 * 5 * 2 = 540 combinations, i.e. 2,700 fits.
gs5 = GridSearchCV(
    estimator=pipe5,
    param_grid=pipe_params5,
    cv=5,
)

In [74]:
#gs5.fit(X_train, y_train)

In [75]:
#gs5.score(X_train, y_train)

In [76]:
#gs5.score(X_test, y_test)

#### Compare and choose best model 

Model Name | Vectorizer | Train Score|Test Score|Train/Test Score gap
-|-|-|-|-
Logistic Regression|CountVectorizer|99.2%|90.8%|8.4%
Logistic Regression|TfidfVectorizer|97.5%|92.0%|5.5%
Naive Bayes|CountVectorizer|95.7%|93.3%|2.4%
Naive Bayes|TfidfVectorizer|95.0%|91.2%|3.8%

According to the score table, logistic regression shows a large gap between train and test scores with both CountVectorizer and TfidfVectorizer (5.5–8.4 percentage points), which indicates substantial overfitting.

For Naive Bayes, the gap between train and test scores is smaller with both CountVectorizer and TfidfVectorizer (2.4–3.8 percentage points). The model still overfits, but much less than logistic regression.

Among these 4 models, I choose Naive Bayes combined with CountVectorizer as the best model because it overfits the least (smallest train/test gap, 2.4 percentage points) and also has the highest test score (93.3%).

#### Confusion Table with best model

In [77]:
# Generate a confusion matrix for the selected model (gs3: CountVectorizer + Naive Bayes).
# confusion_matrix is already imported in the first cell, so it is not re-imported here.
y_preds = gs3.predict(X_test)

# labels=[0, 1] pins the row/column order (0 = tea, 1 = coffee) so that it always
# matches the hard-coded index and column names below.
pd.DataFrame(confusion_matrix(y_test, y_preds, labels=[0, 1]),
             columns=['predict tea', 'predict coffee'],
             index=['actual tea', 'actual coffee'])

Unnamed: 0,predict tea,predict coffee
actual tea,230,15
actual coffee,17,215


In [78]:
# Derive the usual binary-classification metrics from the confusion matrix
# (class 1 = coffee is treated as the positive class).
tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
total = tp + fp + tn + fn

accuracy = (tp + tn) / total
misclassification = (fp + fn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
specificity = tn / (tn + fp)

print('Accuracy: {}'.format(round(accuracy, 4)))
print('Misclassification rate: {}'.format(round(misclassification, 4)))
print('Precision: {}'.format(round(precision, 4)))
print('Recall: {}'.format(round(recall, 4)))
print('Specificity: {}'.format(round(specificity, 4)))

Accuracy: 0.9329
Misclassification rate: 0.0671
Precision: 0.9348
Recall: 0.9267
Specificity: 0.9388


In [79]:
preds_prob = gs3.predict_proba(X_test)

In [80]:
# Collect, per test post: the cleaned text, the predicted probability of the
# coffee class, the predicted label and the true label.
# Column 1 of predict_proba is the probability of class 1 (is_coffee == 1);
# slicing the array replaces the element-by-element Python loop.
preds = pd.DataFrame({
    'clean_combine': X_test,
    'preds_prob': preds_prob[:, 1],
    'preds': y_preds,
    'true_y': y_test,
})
preds

Unnamed: 0,clean_combine,preds_prob,preds,true_y
1697,sorry place ask search internet really find an...,6.917611e-20,0,0
1118,recent haul arrive,1.882665e-02,0,0
1907,nice relaxing day dan cong oolong secret spot,4.142709e-04,0,0
1276,nerdy stir heat plate matcha setup work amazing,2.625289e-02,0,0
136,hey sorry address wonder bit hope lurker may f...,1.000000e+00,1,1
...,...,...,...,...
800,miroco milk heater frother bit disspointe capa...,1.000000e+00,1,1
133,niche killer jam hoffman,9.924121e-01,1,1
1761,hey everyone boyfriend really also travel want...,7.192477e-04,0,0
769,light roasted ethiopian yirgacheffe bean smell...,1.000000e+00,1,1


In [81]:
# Prediction error per post: +1 = predicted coffee but actually tea,
# -1 = predicted tea but actually coffee, 0 = correct.
preds['diff'] = preds['preds'].sub(preds['true_y'])

In [82]:
# Predicted coffee, actually tea (diff == 1): print each misclassified post,
# separated by a dashed rule.
predicted_coffee_actual_tea = preds.loc[preds['diff'] == 1, 'clean_combine']
for post in predicted_coffee_actual_tea:
    print(post)
    print('-' * 20)

buy haru bancha yuuki cha recommend vender gram dry minute still taste light way taste strong gram always taste light way haru bancha strong
--------------------
see cabinet low crate really milk crate
--------------------
year journey finally space money table chair issue drag outside want gentle breeze
--------------------
shoot picture cheap canon cheap haha
--------------------
startet collect patina thought patina
--------------------
decide give space
--------------------
bridal shower party success
--------------------
aficionado little pot filter build inside ease boiling add extract many gram would say day extract indian ground look loose gifted many day safe extract grind
--------------------
original gift idea fail great backup
--------------------
book well day
--------------------
quick really find answer online head eastern european western asia bit wanting bring back kilo wonder anyone restriction sort family plenty suitcase space issue turkish airline allow check bag pe

- Among these mispredictions, some words, such as 'taste' and 'would', are among the top words in both the coffee and the tea CountVectorizer tokens. We could consider removing these shared words.

In [83]:
preds.loc[preds['diff'] == -1]

Unnamed: 0,clean_combine,preds_prob,preds,true_y,diff
566,hello import family farm salvador united state...,7.780633e-06,0,1,-1
421,hello guy look purchase mineral mixture simila...,0.1315962,0,1,-1
181,last year seem clear market high quality seem ...,0.03181742,0,1,-1
615,hello tour latin america currently guatemala l...,0.003759533,0,1,-1
345,anyone townsend english heritage taste history...,0.0003385136,0,1,-1
362,amazon silver wilfa svart worth pick price wil...,0.1294736,0,1,-1
578,guy great take turkey baster force intake forc...,0.178599,0,1,-1
178,big bottle infuse mint leave lead idea anyone ...,2.699787e-06,0,1,-1
117,howdy love taste however totally caffeine into...,0.2256067,0,1,-1
378,gon take guess sub put working research projec...,0.001388888,0,1,-1


In [84]:
# Predicted tea, actually coffee (diff == -1): print each misclassified post,
# separated by a dashed rule.
predicted_tea_actual_coffee = preds.loc[preds['diff'] == -1, 'clean_combine']
for post in predicted_tea_actual_coffee:
    print(post)
    print('-' * 20)

hello import family farm salvador united state start import already roast package sell online future import green roasted package anyone need type fda certification anything import already roast package united state would love help seem find anything fda website process need also anyone recommendation freight forwarder someone help custom process since would first time import help advice would appreciate thank import salvador usa
--------------------
hello guy look purchase mineral mixture similar offer perfect third wave live europe want pay euro ship european company supply similar product thank mineral mix
--------------------
last year seem clear market high quality seem people want high quality small viable market high scoring quality price regular quality market top quality third wave
--------------------
hello tour latin america currently guatemala look comprehensive map encyclopedia different type grow central south america visit anyone recommend book website find list map form

- Most of these posts contain the word 'anyone', which could be a significant term that led the model to misclassify them as tea posts: 'anyone' is in the top-20 word list of the tea-post CountVectorizer tokens.

## Conclusion and Recommendations