In [40]:
import pandas as pd
import numpy as np
import nltk


from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



In [119]:
# Load the masked West Wing quotes and discard the index column that the CSV
# export left behind, then preview the first few rows.
df = pd.read_csv('4ch_west_wing_masked.csv').drop(columns=["Unnamed: 0"])

df.head(3)

Unnamed: 0,Characters,Quotes
0,josh,Yeah. This is Josh Lyman. What’s going on?
1,toby,"You know when you guys say that, it sounds rid..."
2,toby,We’re flying in a Lockheed eagle series L-1011...


In [29]:
# Order the rows by speaker so each character's quotes sit together.
df = df.sort_values(by="Characters")

In [108]:
def corpus_builder(dataframe):
    """Build one concatenated train corpus and one test corpus per character.

    The quotes in ``dataframe`` are split into train and test partitions with
    ``train_test_split`` (fixed ``random_state=4``). For every distinct value
    of the "Characters" column, all of that character's quotes in each
    partition are joined with single spaces into one string.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a "Quotes" column (text) and a "Characters" column
        (speaker label).

    Returns
    -------
    corpi_train_quotes : numpy.ndarray
        One joined training corpus per character.
    corpi_train_chars : numpy.ndarray
        Character labels, in order of first appearance in ``dataframe``.
    corpi_test_quotes : numpy.ndarray
        One joined test corpus per character (empty string when a character
        has no quotes in the test partition).
    corpi_test_chars : numpy.ndarray
        Same labels as ``corpi_train_chars``, as an independent array.
    """
    # Read from the argument, not the notebook-global `df`, so the function
    # works on whatever frame the caller passes in.
    X = dataframe["Quotes"].values.astype('U')
    y = dataframe["Characters"].values.astype('U')

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

    df_train = pd.DataFrame({"Quotes": X_train, "Characters": y_train})
    df_test = pd.DataFrame({"Quotes": X_test, "Characters": y_test})

    # Computed once; the label order defines the order of every returned array.
    characters = list(dataframe["Characters"].unique())

    def _joined_quotes(partition, character):
        # The mask must come from the partition itself: a mask built from a
        # different frame is misaligned with the partition's index.
        mask = partition["Characters"] == character
        return " ".join(partition.loc[mask, "Quotes"])

    corpi_train_quotes = [_joined_quotes(df_train, char) for char in characters]
    corpi_test_quotes = [_joined_quotes(df_test, char) for char in characters]

    return (
        np.asarray(corpi_train_quotes),
        np.asarray(characters),
        np.asarray(corpi_test_quotes),
        np.asarray(characters),
    )

In [109]:
# Per-character corpora: each X entry is all of one character's quotes joined
# into a single string. These four names are reassigned by the per-quote
# train_test_split in a later cell.
X_train, y_train, X_test, y_test = corpus_builder(df)



In [120]:
# Features are the raw quote strings, labels are the speaker names; both are
# coerced to NumPy unicode arrays before splitting.
X, y = (df[column].values.astype('U') for column in ("Quotes", "Characters"))
print(X.shape, y.shape)

(16061,) (16061,)


In [121]:
# Per-quote train/test split with scikit-learn's default test fraction; the
# fixed random_state keeps the partition the same across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [122]:
# Inspect the training quotes (NumPy elides the middle of long arrays).
X_train

array(["Well, there aren't going to be any people today. ",
       "Well, I don't approve of that.",
       'Speaking of classic conundrums...', ...,
       'How about we build the nine best schools in the world?',
       'Thank you. Is the pastrami from Krupins?',
       'It’s activist vacation is what it is. Spring break for anarchist wannabes. The black  t-shirts, the gas masks as fashion accessories. '],
      dtype='<U1597')

In [123]:
# Inspect the matching training labels (speaker names).
y_train

array(['bartlet', 'bartlet', 'c.j.', ..., 'c.j.', 'bartlet', 'toby'],
      dtype='<U7')

In [124]:
#  Naive Bayes baseline: bag-of-words counts -> TF-IDF -> multinomial NB
nb_steps = [
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

# fit() returns the fitted pipeline itself, so rebinding the name is optional.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Fraction of test quotes attributed to the correct character.
(predicted == y_test).mean()

0.39193227091633465

In [125]:
#  SGD-SVM: linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    # `n_iter` is no longer accepted by SGDClassifier. max_iter=5 with
    # tol=None (no early stopping) runs the same fixed five epochs.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
# Test-set accuracy.
np.mean(predicted_svm == y_test)



0.386703187250996

In [128]:
# Random Forest on unigram TF-IDF features
text_clf_RanFor = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Seeded like the other models so the reported accuracy is reproducible;
    # an unseeded forest gives a different score on every run.
    ('clf-RanFor', RandomForestClassifier(n_estimators=10, random_state=42)),
])

text_clf_RanFor = text_clf_RanFor.fit(X_train, y_train)
predicted_RanFor = text_clf_RanFor.predict(X_test)
# Test-set accuracy.
np.mean(predicted_RanFor == y_test)

0.33117529880478086

In [129]:
# Logistic regression on unigram TF-IDF features
text_clf_LogReg = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Step renamed from the copy-pasted 'clf-RanFor' so that named_steps and
    # any grid-search keys ('clf-LogReg__C', ...) describe the real estimator.
    ('clf-LogReg', LogisticRegression()),
])

text_clf_LogReg = text_clf_LogReg.fit(X_train, y_train)
predicted_LogReg = text_clf_LogReg.predict(X_test)
# Test-set accuracy.
np.mean(predicted_LogReg == y_test)

0.3881972111553785

In [133]:
# Hyper-parameter grid for the Naive Bayes pipeline: n-gram span up to
# 5-grams, IDF weighting on/off, and two smoothing strengths.
parameters = {
    'vect__ngram_range': [(1, upper) for upper in range(1, 6)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# Cross-validated search over the grid using every available core.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(X_train, y_train)

for result in (gs_clf.best_score_, gs_clf.best_params_):
    print(result)

0.3817351598173516
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 4)}


In [131]:
# Hyper-parameter grid for the SGD-SVM pipeline: n-gram span up to 5-grams,
# IDF weighting on/off, and two regularisation strengths.
parameters_svm = {
    'vect__ngram_range': [(1, upper) for upper in range(1, 6)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# Cross-validated search over the grid using every available core.
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1).fit(X_train, y_train)

for result in (gs_clf_svm.best_score_, gs_clf_svm.best_params_):
    print(result)

0.3878787878787879
{'clf-svm__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}




In [132]:
# Parameter-adjusted SGD-SVM: unigrams + bigrams with IDF weighting
# NOTE(review): the SVM grid search output reports alpha=0.01 as the best
# value, but alpha=.001 is used here -- confirm which one is intended.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # `n_iter` is no longer accepted by SGDClassifier. max_iter=4 with
    # tol=None (no early stopping) runs the same fixed four epochs.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=.001,
                              max_iter=4, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
# Test-set accuracy.
np.mean(predicted_svm == y_test)



0.40039840637450197

In [17]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Characters,Quotes
21739,bartlet,Get in the car.
21506,bartlet,That's why you want to say no?
21505,bartlet,I'd argue he's not a civilian. So would the At...
21504,bartlet,Do you not consider it relevant that it would ...
8747,bartlet,I imagine he’ll be along in a moment.
8748,bartlet,"Oh, yes."
8749,bartlet,"Good afternoon, Mrs. Landingham."
8750,bartlet,Go away right now.
8751,bartlet,Why don’t we go inside?
8752,bartlet,Blood Pressure 120/80.
