In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string


#ML
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import RandomizedSearchCV
#for execution time
import time


In [106]:
df = pd.read_csv('../Data/lotr/prepped_data.csv',dtype=str)

In [94]:
df.columns

Index(['char', 'dialog', 'movie', 'corpus_dialog'], dtype='object')

In [95]:
df.char.value_counts()

FRODO      229
SAM        218
GANDALF    215
ARAGORN    211
PIPPIN     162
MERRY      137
GOLLUM     134
GIMLI      116
THEODEN    109
FARAMIR     65
Name: char, dtype: int64

In [117]:
df = df[df.char.isin(['FRODO', 'GANDALF', 'ARAGORN', 'SAM'])]

In [74]:
df.columns

Index(['char', 'dialog', 'movie', 'corpus_dialog'], dtype='object')

In [75]:
def sklearn_models(model,X_train, X_test, y_train, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    labels = model.predict(X_test)
    mat = confusion_matrix(y_test, labels)


    sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
                xticklabels=np.unique(y_train), yticklabels=np.unique(y_train))

    plt.xlabel('true label')
    plt.ylabel('predicted label')    
    plt.show()
    print('Accuracy score:', accuracy_score(y_test, labels))

    print("Execution time: %s seconds " % (time.time() - start_time))
    
    
    



In [118]:
df.dropna(inplace=True)

In [98]:
X = df.dialog.to_list()
count_vect = CountVectorizer()
X = count_vect.fit_transform(X)

In [119]:
X = df.dialog.to_list()
tf_idf_vect = TfidfVectorizer()
X = tf_idf_vect.fit_transform(X)

In [120]:
y = df['char']
y = pd.get_dummies(y)

In [121]:
y

Unnamed: 0,ARAGORN,FRODO,GANDALF,SAM
1,0,1,0,0
2,0,1,0,0
5,0,1,0,0
8,0,0,0,1
9,0,0,0,1
...,...,...,...,...
1508,0,0,1,0
1518,0,0,1,0
1526,0,0,1,0
1528,1,0,0,0


In [122]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8,random_state=1)

In [63]:
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = KFold(n_splits=5, shuffle=True, random_state=42)    # evaluate model
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return scores


In [58]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)] # Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] 
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
# bootstrap = [True, False]
oob_score = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features, 
               'max_depth': max_depth, 
               'min_samples_split': min_samples_split, 
               'min_samples_leaf': min_samples_leaf,
               'oob_score': oob_score
              }


In [59]:
rf = RandomForestClassifier(random_state = 42)

In [60]:
rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               n_iter=100, 
                               cv=3, 
                               verbose=2,
                               random_state=42, 
                               n_jobs = -1)

In [61]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [62]:
rf_random.best_params_

{'oob_score': False,
 'n_estimators': 1577,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None}

In [123]:
best_rf = RandomForestClassifier(oob_score=False, 
                                 n_estimators=1577, 
                                 min_samples_split=10, 
                                 min_samples_leaf=1,
                                 max_features='sqrt',
                                 max_depth=None
                                )

In [124]:
best_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=1577,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [127]:
rfc_cv_score = cross_val_score(best_rf, X, y, cv=10, scoring='accuracy')

In [128]:
rfc_cv_score

array([0.44047619, 0.36904762, 0.46428571, 0.44047619, 0.42857143,
       0.38095238, 0.35714286, 0.18072289, 0.1686747 , 0.27710843])