In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, 
                             f1_score, 
                             precision_score, 
                             recall_score, 
                             roc_auc_score, 
                             roc_curve, 
                             accuracy_score)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


\

In [3]:
gc_data = pd.read_csv('cd_all_2020_data.csv')
gc_data
authors_cleaned = []
for author in gc_data.author:
    authors_cleaned.append(author.replace("\u00A0"," "))
gc_data.author = authors_cleaned


In [35]:
talks = gc_data[gc_data.author.str.contains('Nelson|Monson|Hinckley|Benson', regex=True)]
# talks = gc_data[gc_data.author.str.contains('Nelson|Monson', regex=True)]

targets = talks.apply(lambda x: x[0].split(' ')[-1], axis=1)
talks["target"] = targets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  talks["target"] = targets


In [36]:
# talks[talks.target == ""].count()
gc_data[gc_data.author.str.contains('Benson', regex=True)].count()


author    62
date      62
text      62
dtype: int64

In [37]:
wn = WordNetLemmatizer()


def process_text(x):
    x = x.lower()
    tokens = wordpunct_tokenize(x)
    tokens = [tok for tok in tokens if tok.isalnum()]
    tokens = [tok for tok in tokens if tok not in sw]
    tokens = [wn.lemmatize(tok) for tok in tokens]
    return " ".join(tokens)

talks["processed_text"] = talks.text.apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  talks["processed_text"] = talks.text.apply(process_text)


In [38]:
train, test = train_test_split(talks, test_size=.3, 
                               stratify=talks.target, 
                               random_state=419)

y_train = train['target']
y_test = test['target']



In [39]:
tfidf = TfidfVectorizer(min_df=.005, stop_words=sw)
tfidf.fit(train['processed_text'])
print(len(tfidf.get_feature_names()))

X_train = tfidf.transform(train['processed_text'])
X_test = tfidf.transform(test['processed_text'])

nb = MultinomialNB()
nb.fit(X_train, y_train)
yhat = nb.predict(X_test)
y_prob = nb.predict_proba(X_test)[:,1]


9277


In [40]:
print(accuracy_score(y_test, yhat))
yhat

0.6777251184834123


array(['Monson', 'Monson', 'Hinckley', 'Monson', 'Hinckley', 'Monson',
       'Monson', 'Hinckley', 'Monson', 'Monson', 'Hinckley', 'Hinckley',
       'Monson', 'Hinckley', 'Monson', 'Hinckley', 'Monson', 'Hinckley',
       'Hinckley', 'Hinckley', 'Monson', 'Monson', 'Monson', 'Monson',
       'Monson', 'Monson', 'Monson', 'Monson', 'Monson', 'Hinckley',
       'Monson', 'Hinckley', 'Monson', 'Monson', 'Hinckley', 'Nelson',
       'Monson', 'Monson', 'Monson', 'Hinckley', 'Hinckley', 'Hinckley',
       'Monson', 'Monson', 'Monson', 'Hinckley', 'Monson', 'Monson',
       'Hinckley', 'Hinckley', 'Monson', 'Monson', 'Hinckley', 'Hinckley',
       'Hinckley', 'Monson', 'Hinckley', 'Hinckley', 'Monson', 'Monson',
       'Hinckley', 'Monson', 'Monson', 'Hinckley', 'Monson', 'Monson',
       'Monson', 'Monson', 'Hinckley', 'Hinckley', 'Monson', 'Monson',
       'Monson', 'Hinckley', 'Hinckley', 'Monson', 'Hinckley', 'Hinckley',
       'Monson', 'Monson', 'Hinckley', 'Hinckley', 'Monson', 'Mon

In [41]:
nelson_christmas = pd.read_csv('nelson_christmas.csv')
nelson_christmas_vector = tfidf.transform(nelson_christmas['text'])


In [42]:
nb.predict(nelson_christmas_vector)


array(['Monson'], dtype='<U8')

In [43]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

# Random Forest
rf = RandomForestClassifier(n_jobs=-1)
parameters = {
    'n_estimators': [50, 250, 500],
    'max_depth': [5, 10, 50, 100, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train)


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'max_depth': [5, 10, 50, 100, None],
                         'n_estimators': [50, 250, 500]})

In [44]:
print_results(cv)
# 0.817 BEST PARAMS: {'max_depth': 50, 'n_estimators': 250}
# 0.798 (+/-0.115) for {'max_depth': 50, 'n_estimators': 500}



BEST PARAMS: {'max_depth': None, 'n_estimators': 500}

0.748 (+/-0.083) for {'max_depth': 5, 'n_estimators': 50}
0.746 (+/-0.044) for {'max_depth': 5, 'n_estimators': 250}
0.748 (+/-0.046) for {'max_depth': 5, 'n_estimators': 500}
0.791 (+/-0.058) for {'max_depth': 10, 'n_estimators': 50}
0.817 (+/-0.046) for {'max_depth': 10, 'n_estimators': 250}
0.815 (+/-0.052) for {'max_depth': 10, 'n_estimators': 500}
0.805 (+/-0.079) for {'max_depth': 50, 'n_estimators': 50}
0.839 (+/-0.038) for {'max_depth': 50, 'n_estimators': 250}
0.839 (+/-0.036) for {'max_depth': 50, 'n_estimators': 500}
0.825 (+/-0.045) for {'max_depth': 100, 'n_estimators': 50}
0.833 (+/-0.029) for {'max_depth': 100, 'n_estimators': 250}
0.835 (+/-0.054) for {'max_depth': 100, 'n_estimators': 500}
0.825 (+/-0.028) for {'max_depth': None, 'n_estimators': 50}
0.825 (+/-0.047) for {'max_depth': None, 'n_estimators': 250}
0.839 (+/-0.046) for {'max_depth': None, 'n_estimators': 500}


In [51]:
rf = RandomForestClassifier(max_depth=None, n_estimators=500, n_jobs=-1)
rf.fit(X_train, y_train)
yhatrf = (rf.predict(X_test))
print(accuracy_score(y_test, yhatrf))
# 0.7555555555555555
# 0.8222222222222222



0.8483412322274881


In [46]:
rf.predict(nelson_christmas_vector)

array(['Nelson'], dtype=object)

In [47]:
ada = AdaBoostClassifier()
parameters = {
    'n_estimators': [300, 500, 700],
    'learning_rate': [.5, 1, 5, 10]
}
cv = GridSearchCV(ada, parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5, 1, 5, 10],
                         'n_estimators': [300, 500, 700]})

In [48]:
print_results(cv)
# 0.673 BEST PARAMS: {'learning_rate': 0.5, 'n_estimators': 700}


BEST PARAMS: {'learning_rate': 0.5, 'n_estimators': 700}

0.705 (+/-0.099) for {'learning_rate': 0.5, 'n_estimators': 300}
0.73 (+/-0.089) for {'learning_rate': 0.5, 'n_estimators': 500}
0.75 (+/-0.052) for {'learning_rate': 0.5, 'n_estimators': 700}
0.703 (+/-0.095) for {'learning_rate': 1, 'n_estimators': 300}
0.732 (+/-0.131) for {'learning_rate': 1, 'n_estimators': 500}
0.742 (+/-0.076) for {'learning_rate': 1, 'n_estimators': 700}
0.39 (+/-0.117) for {'learning_rate': 5, 'n_estimators': 300}
0.421 (+/-0.153) for {'learning_rate': 5, 'n_estimators': 500}
0.475 (+/-0.107) for {'learning_rate': 5, 'n_estimators': 700}
0.159 (+/-0.11) for {'learning_rate': 10, 'n_estimators': 300}
0.175 (+/-0.071) for {'learning_rate': 10, 'n_estimators': 500}
0.171 (+/-0.078) for {'learning_rate': 10, 'n_estimators': 700}


In [49]:
ada = AdaBoostClassifier(learning_rate=.2, n_estimators=1000)
ada.fit(X_train, y_train)
yhatada = (ada.predict(X_test))
print(accuracy_score(y_test, yhatada))
# 0.7333333333333333


0.8483412322274881


In [52]:
ada.predict(nelson_christmas_vector)

array(['Monson'], dtype=object)

In [54]:
import numpy as np

import tensorflow as tf
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])
