In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, 
                             f1_score, 
                             precision_score, 
                             recall_score, 
                             roc_auc_score, 
                             roc_curve, 
                             accuracy_score)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()


In [2]:
gc_data = pd.read_csv('cd_2011_2020_data.csv')
gc_data
authors_cleaned = []
for author in gc_data.author:
    authors_cleaned.append(author.replace("\u00A0"," "))
gc_data.author = authors_cleaned


In [3]:
talks = gc_data[gc_data.author.str.contains('Nelson|Monson|Uchtdorf|R. Holland|Bednar', regex=True)]
# talks = gc_data[gc_data.author.str.contains('Nelson|Monson', regex=True)]

targets = talks.apply(lambda x: x[0].split(' ')[-1], axis=1)
talks["target"] = targets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  talks["target"] = targets


In [4]:
talks.author.str.contains('Bednar', regex=True)

0      False
21     False
24     False
25      True
26     False
       ...  
736    False
742    False
743    False
750    False
754    False
Name: author, Length: 169, dtype: bool

In [5]:
wn = WordNetLemmatizer()


def process_text(x):
    x = x.lower()
    tokens = wordpunct_tokenize(x)
    tokens = [tok for tok in tokens if tok.isalnum()]
    tokens = [tok for tok in tokens if tok not in sw]
    tokens = [wn.lemmatize(tok) for tok in tokens]
    return " ".join(tokens)

talks["processed_text"] = talks.text.apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  talks["processed_text"] = talks.text.apply(process_text)


In [6]:
train, test = train_test_split(talks, test_size=.3, 
                               stratify=talks.target, 
                               random_state=419)

y_train = train['target']
y_test = test['target']



In [7]:
tfidf = TfidfVectorizer(min_df=.005, stop_words=sw)
tfidf.fit(train['processed_text'])
print(len(tfidf.get_feature_names()))

X_train = tfidf.transform(train['processed_text'])
X_test = tfidf.transform(test['processed_text'])

nb = MultinomialNB()
nb.fit(X_train, y_train)
yhat = nb.predict(X_test)
y_prob = nb.predict_proba(X_test)[:,1]


9534


In [8]:
print(accuracy_score(y_test, yhat))


0.5490196078431373


array(['Uchtdorf', 'Monson', 'Uchtdorf', 'Uchtdorf', 'Monson', 'Uchtdorf',
       'Monson', 'Monson', 'Uchtdorf', 'Monson', 'Uchtdorf', 'Uchtdorf',
       'Uchtdorf', 'Uchtdorf', 'Nelson', 'Uchtdorf', 'Nelson', 'Uchtdorf',
       'Uchtdorf', 'Uchtdorf', 'Uchtdorf', 'Uchtdorf', 'Uchtdorf',
       'Uchtdorf', 'Uchtdorf', 'Monson', 'Uchtdorf', 'Uchtdorf', 'Monson',
       'Uchtdorf', 'Uchtdorf', 'Nelson', 'Uchtdorf', 'Uchtdorf', 'Nelson',
       'Uchtdorf', 'Uchtdorf', 'Monson', 'Uchtdorf', 'Uchtdorf', 'Monson',
       'Nelson', 'Monson', 'Uchtdorf', 'Nelson', 'Uchtdorf', 'Uchtdorf',
       'Uchtdorf', 'Uchtdorf', 'Uchtdorf', 'Nelson'], dtype='<U8')

In [136]:
nelson_christmas = pd.read_csv('nelson_christmas.csv')
nelson_christmas_vector = tfidf.transform(nelson_christmas['text'])


In [137]:
# not correct: predicts uchtdorf not nelson
nb.predict(nelson_christmas_vector)


array(['Uchtdorf'], dtype='<U8')

In [138]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

# Random Forest
rf = RandomForestClassifier(n_jobs=-1)
parameters = {
    'n_estimators': [50, 250, 500],
    'max_depth': [5, 10, 50, 100, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train)


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'max_depth': [5, 10, 50, 100, None],
                         'n_estimators': [50, 250, 500]})

In [139]:
print_results(cv)
# 0.817 BEST PARAMS: {'max_depth': 50, 'n_estimators': 250}
# 0.798 (+/-0.115) for {'max_depth': 50, 'n_estimators': 500}



BEST PARAMS: {'max_depth': 100, 'n_estimators': 500}

0.677 (+/-0.182) for {'max_depth': 5, 'n_estimators': 50}
0.669 (+/-0.069) for {'max_depth': 5, 'n_estimators': 250}
0.694 (+/-0.075) for {'max_depth': 5, 'n_estimators': 500}
0.662 (+/-0.084) for {'max_depth': 10, 'n_estimators': 50}
0.728 (+/-0.168) for {'max_depth': 10, 'n_estimators': 250}
0.72 (+/-0.141) for {'max_depth': 10, 'n_estimators': 500}
0.678 (+/-0.042) for {'max_depth': 50, 'n_estimators': 50}
0.729 (+/-0.087) for {'max_depth': 50, 'n_estimators': 250}
0.746 (+/-0.122) for {'max_depth': 50, 'n_estimators': 500}
0.678 (+/-0.125) for {'max_depth': 100, 'n_estimators': 50}
0.72 (+/-0.05) for {'max_depth': 100, 'n_estimators': 250}
0.788 (+/-0.077) for {'max_depth': 100, 'n_estimators': 500}
0.702 (+/-0.131) for {'max_depth': None, 'n_estimators': 50}
0.754 (+/-0.104) for {'max_depth': None, 'n_estimators': 250}
0.737 (+/-0.151) for {'max_depth': None, 'n_estimators': 500}


In [140]:
rf = RandomForestClassifier(max_depth=50, n_estimators=500, n_jobs=-1)
rf.fit(X_train, y_train)
yhatrf = (rf.predict(X_test))
print(accuracy_score(y_test, yhatrf))
# 0.7555555555555555
# 0.8222222222222222



0.803921568627451


In [141]:
rf.predict(nelson_christmas_vector)

array(['Nelson'], dtype=object)

In [142]:
ada = AdaBoostClassifier()
parameters = {
    'n_estimators': [300, 500, 700],
    'learning_rate': [.5, 1, 5, 10]
}
cv = GridSearchCV(ada, parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5, 1, 5, 10],
                         'n_estimators': [300, 500, 700]})

In [143]:
print_results(cv)
# 0.673 BEST PARAMS: {'learning_rate': 0.5, 'n_estimators': 700}


BEST PARAMS: {'learning_rate': 0.5, 'n_estimators': 700}

0.611 (+/-0.175) for {'learning_rate': 0.5, 'n_estimators': 300}
0.636 (+/-0.225) for {'learning_rate': 0.5, 'n_estimators': 500}
0.653 (+/-0.214) for {'learning_rate': 0.5, 'n_estimators': 700}
0.397 (+/-0.127) for {'learning_rate': 1, 'n_estimators': 300}
0.397 (+/-0.127) for {'learning_rate': 1, 'n_estimators': 500}
0.397 (+/-0.127) for {'learning_rate': 1, 'n_estimators': 700}
0.543 (+/-0.166) for {'learning_rate': 5, 'n_estimators': 300}
0.609 (+/-0.096) for {'learning_rate': 5, 'n_estimators': 500}
0.617 (+/-0.186) for {'learning_rate': 5, 'n_estimators': 700}
0.288 (+/-0.251) for {'learning_rate': 10, 'n_estimators': 300}
0.247 (+/-0.142) for {'learning_rate': 10, 'n_estimators': 500}
0.279 (+/-0.153) for {'learning_rate': 10, 'n_estimators': 700}


In [144]:
ada = AdaBoostClassifier(learning_rate=.2, n_estimators=1000)
ada.fit(X_train, y_train)
yhatada = (ada.predict(X_test))
print(accuracy_score(y_test, yhatada))
# 0.7333333333333333


0.7843137254901961


In [145]:
ada.predict(nelson_christmas_vector)

array(['Nelson'], dtype=object)