In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (confusion_matrix, 
                             f1_score, 
                             precision_score, 
                             recall_score, 
                             roc_auc_score, 
                             roc_curve, 
                             accuracy_score)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


sw = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()


In [3]:
gc_data = pd.read_csv('cd_2011_2020_data.csv')
gc_data
authors_cleaned = []
for author in gc_data.author:
    authors_cleaned.append(author.replace("\u00A0"," "))
gc_data.author = authors_cleaned


In [4]:
talks = gc_data[gc_data.author.str.contains('Nelson|Monson|Uchtdorf|R. Holland|Bednar', regex=True)]

targets = talks.apply(lambda x: x[0].split(' ')[-1], axis=1)
talks["target"] = targets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  talks["target"] = targets


In [5]:
print(talks.shape)
talks.head()

(169, 4)


Unnamed: 0,author,date,text,target
0,President Russell M. Nelson,2020/04,"My beloved brothers and sisters, as we welcome...",Nelson
21,President Russell M. Nelson,2020/04,What a unique and wonderful session this has b...,Nelson
24,Elder Jeffrey R. Holland,2020/04,"Last October, President Russell M. Nelson invi...",Holland
25,Elder David A. Bednar,2020/04,"In the Sacred Grove 200 years ago, young Josep...",Bednar
26,President Russell M. Nelson,2020/04,"My dear brothers and sisters, how thankful I a...",Nelson


In [6]:
wn = WordNetLemmatizer()


def process_text(x):
    x = x.lower()
    tokens = wordpunct_tokenize(x)
    tokens = [tok for tok in tokens if tok.isalnum()]
    tokens = [tok for tok in tokens if tok not in sw]
    tokens = [wn.lemmatize(tok) for tok in tokens]
    return " ".join(tokens)

talks["processed_text"] = talks.text.apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  talks["processed_text"] = talks.text.apply(process_text)


In [7]:
train, test = train_test_split(talks, test_size=.3, 
                               stratify=talks.target, 
                               random_state=419)

y_train = train['target']
y_test = test['target']



In [8]:
tfidf = TfidfVectorizer(min_df=.005, stop_words=sw)
tfidf.fit(train['processed_text'])
print(len(tfidf.get_feature_names()))

X_train = tfidf.transform(train['processed_text'])
X_test = tfidf.transform(test['processed_text'])

nb = MultinomialNB()
nb.fit(X_train, y_train)
yhat = nb.predict(X_test)
y_prob = nb.predict_proba(X_test)[:,1]


9534


In [9]:
print(accuracy_score(y_test, yhat))


0.5490196078431373


In [10]:
print(nb.classes_)
confusion_matrix(y_test, yhat)
# predicts most of talks as Uchtdorf


['Bednar' 'Holland' 'Monson' 'Nelson' 'Uchtdorf']


array([[ 0,  0,  1,  0,  5],
       [ 0,  0,  0,  0,  6],
       [ 0,  0,  8,  1,  4],
       [ 0,  0,  1,  6,  5],
       [ 0,  0,  0,  0, 14]])

In [11]:
nelson_christmas = pd.read_csv('nelson_christmas.csv')
nelson_christmas_vector = tfidf.transform(nelson_christmas['text'])


In [12]:
# not correct: predicts uchtdorf not nelson
nb.predict(nelson_christmas_vector)


array(['Uchtdorf'], dtype='<U8')

In [31]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

# Random Forest
rf = RandomForestClassifier(n_jobs=-1)
parameters = {
    'n_estimators': [50, 250, 500],
    'max_depth': [5, 10, 50, 100, None]
}

cv = GridSearchCV(rf, parameters, cv=5)
cv.fit(X_train, y_train)


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1),
             param_grid={'max_depth': [5, 10, 50, 100, None],
                         'n_estimators': [50, 250, 500]})

In [32]:
print_results(cv)
# 0.817 BEST PARAMS: {'max_depth': 50, 'n_estimators': 250}
# 0.798 (+/-0.115) for {'max_depth': 50, 'n_estimators': 500}



BEST PARAMS: {'max_depth': 100, 'n_estimators': 250}

0.678 (+/-0.074) for {'max_depth': 5, 'n_estimators': 50}
0.728 (+/-0.125) for {'max_depth': 5, 'n_estimators': 250}
0.703 (+/-0.063) for {'max_depth': 5, 'n_estimators': 500}
0.737 (+/-0.063) for {'max_depth': 10, 'n_estimators': 50}
0.72 (+/-0.088) for {'max_depth': 10, 'n_estimators': 250}
0.729 (+/-0.09) for {'max_depth': 10, 'n_estimators': 500}
0.661 (+/-0.077) for {'max_depth': 50, 'n_estimators': 50}
0.736 (+/-0.134) for {'max_depth': 50, 'n_estimators': 250}
0.737 (+/-0.126) for {'max_depth': 50, 'n_estimators': 500}
0.686 (+/-0.153) for {'max_depth': 100, 'n_estimators': 50}
0.754 (+/-0.106) for {'max_depth': 100, 'n_estimators': 250}
0.737 (+/-0.132) for {'max_depth': 100, 'n_estimators': 500}
0.745 (+/-0.126) for {'max_depth': None, 'n_estimators': 50}
0.72 (+/-0.18) for {'max_depth': None, 'n_estimators': 250}
0.728 (+/-0.197) for {'max_depth': None, 'n_estimators': 500}


In [13]:
rf = RandomForestClassifier(max_depth=100, n_estimators=250, n_jobs=-1)
rf.fit(X_train, y_train)
yhatrf = (rf.predict(X_test))
print(accuracy_score(y_test, yhatrf))


0.8431372549019608


In [17]:
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

top_word_indicies = []
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    top_word_indicies.append(indices[f])

print("\n")


# Print top features   
for ind in top_word_indicies:
    print(tfidf.get_feature_names()[ind])

Feature ranking:
1. feature 2184 (0.011652)
2. feature 6151 (0.008166)
3. feature 508 (0.007388)
4. feature 7990 (0.007344)
5. feature 6429 (0.005751)
6. feature 7780 (0.005681)
7. feature 6077 (0.005626)
8. feature 3559 (0.004546)
9. feature 5316 (0.004406)
10. feature 5526 (0.004235)


dear
perhaps
amen
spiritual
prayer
sister
path
found
may
month


In [49]:
print(rf.classes_)
confusion_matrix(y_test, yhatrf)
# Still predicts most of talks as Uchtdorf, but not as bad.


['Bednar' 'Holland' 'Monson' 'Nelson' 'Uchtdorf']


array([[ 5,  0,  0,  1,  0],
       [ 0,  3,  1,  0,  2],
       [ 0,  0, 12,  0,  1],
       [ 0,  0,  2,  8,  2],
       [ 0,  0,  0,  0, 14]])

In [54]:
rf.predict(nelson_christmas_vector)

array(['Nelson'], dtype=object)

In [57]:
ada = AdaBoostClassifier()
parameters = {
    'n_estimators': [300, 500, 700],
    'learning_rate': [.2, .5, 1, 5, 10]
}
cv = GridSearchCV(ada, parameters, cv=5)
cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.2, 0.5, 1, 5, 10],
                         'n_estimators': [300, 500, 700]})

In [58]:
print_results(cv)


BEST PARAMS: {'learning_rate': 0.5, 'n_estimators': 700}

0.636 (+/-0.16) for {'learning_rate': 0.2, 'n_estimators': 300}
0.652 (+/-0.134) for {'learning_rate': 0.2, 'n_estimators': 500}
0.636 (+/-0.103) for {'learning_rate': 0.2, 'n_estimators': 700}
0.619 (+/-0.202) for {'learning_rate': 0.5, 'n_estimators': 300}
0.654 (+/-0.264) for {'learning_rate': 0.5, 'n_estimators': 500}
0.678 (+/-0.191) for {'learning_rate': 0.5, 'n_estimators': 700}
0.397 (+/-0.127) for {'learning_rate': 1, 'n_estimators': 300}
0.397 (+/-0.127) for {'learning_rate': 1, 'n_estimators': 500}
0.397 (+/-0.127) for {'learning_rate': 1, 'n_estimators': 700}
0.577 (+/-0.146) for {'learning_rate': 5, 'n_estimators': 300}
0.609 (+/-0.137) for {'learning_rate': 5, 'n_estimators': 500}
0.551 (+/-0.309) for {'learning_rate': 5, 'n_estimators': 700}
0.287 (+/-0.217) for {'learning_rate': 10, 'n_estimators': 300}
0.185 (+/-0.151) for {'learning_rate': 10, 'n_estimators': 500}
0.221 (+/-0.209) for {'learning_rate': 10, 'n_e

In [59]:
ada = AdaBoostClassifier(learning_rate=0.5, n_estimators=700)
ada.fit(X_train, y_train)
yhatada = (ada.predict(X_test))
print(accuracy_score(y_test, yhatada))


0.7843137254901961


In [60]:
print(ada.classes_)
confusion_matrix(y_test, yhatada)
# Still predicts most of talks as Uchtdorf, but not as bad.

['Bednar' 'Holland' 'Monson' 'Nelson' 'Uchtdorf']


array([[ 2,  0,  1,  1,  2],
       [ 0,  3,  3,  0,  0],
       [ 0,  0, 11,  1,  1],
       [ 0,  0,  0, 10,  2],
       [ 0,  0,  0,  0, 14]])

In [61]:
ada.predict(nelson_christmas_vector)

array(['Nelson'], dtype=object)

In [None]:
 -