In [1]:
import pandas as pd
import numpy as np
import nltk


from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression



In [14]:
# Load the quotes dataset and discard the "Unnamed: 0" column
# (presumably an index that was written out when the CSV was saved).
df = pd.read_csv('4ch_simpsons.csv').drop("Unnamed: 0", axis=1)

df.head(3)

Unnamed: 0,Characters,Quotes
0,homer,"baaart, i'm gonna open my mouth and close my e..."
1,bart,nobody better lay a finger on my butterfinger.
2,homer,what happened to the lights?


In [132]:
# Order the rows by speaker so each character's quotes sit together.
df = df.sort_values(by="Characters")

In [26]:
def corpus_builder(dataframe):
    """Build one concatenated quote corpus per character for a train/test split.

    The quotes in ``dataframe`` are split into train and test sets
    (``train_test_split`` with ``random_state=4``).  For every distinct value
    in the ``"Characters"`` column, all of that character's quotes within a
    split are joined with single spaces into one string.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``"Quotes"`` column and a ``"Characters"`` column.

    Returns
    -------
    corpi_train_quotes : numpy.ndarray
        One joined training corpus per character.
    corpi_train_chars : numpy.ndarray
        Character label for each entry of ``corpi_train_quotes``.
    corpi_test_quotes : numpy.ndarray
        One joined test corpus per character (an empty string when a
        character has no quotes in the test split).
    corpi_test_chars : numpy.ndarray
        Character label for each entry of ``corpi_test_quotes``.
    """
    # Read from the argument, not from a notebook-level global, so the
    # function works on whatever frame the caller passes in.
    # astype('U') stringifies every value (missing values become 'nan').
    X = dataframe["Quotes"].values.astype('U')
    y = dataframe["Characters"].values.astype('U')

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

    df_train = pd.DataFrame({"Quotes": X_train, "Characters": y_train})
    df_test = pd.DataFrame({"Quotes": X_test, "Characters": y_test})

    # Distinct characters in order of first appearance; computed once
    # instead of on every loop iteration.
    characters = list(dataframe["Characters"].unique())

    def _join_quotes(split, character):
        # The mask is built from the same frame it filters, so the boolean
        # index always aligns with the rows being selected.
        return " ".join(split.loc[split["Characters"] == character, "Quotes"])

    corpi_train_quotes = np.asarray([_join_quotes(df_train, c) for c in characters])
    corpi_test_quotes = np.asarray([_join_quotes(df_test, c) for c in characters])

    # Separate arrays so callers may modify one without affecting the other.
    corpi_train_chars = np.asarray(characters)
    corpi_test_chars = np.asarray(characters)

    return corpi_train_quotes, corpi_train_chars, corpi_test_quotes, corpi_test_chars

In [27]:
# Build the per-character corpora for the train and test splits.
# NOTE: these four names are rebound by the later train_test_split cell,
# so the per-quote models below do not use these corpora.
X_train, y_train, X_test, y_test = corpus_builder(df)



In [35]:
# Per-quote features (X) and speaker labels (y) as unicode arrays.
# NOTE: astype('U') stringifies missing values, so a NaN quote becomes 'nan'.
X, y = (df[column].values.astype('U') for column in ("Quotes", "Characters"))
print(X.shape, y.shape)

(13783,) (13783,)


In [36]:
# Hold out 25% of the quotes (scikit-learn's default split size) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [5]:
# Inspect the training quotes.
X_train

array(["dad, you cannot wear that! that's a rastafarian hat.",
       'i need a shirt!  gimmie a shirt!', 'what happened to mindy?', ...,
       'ooh!  ', 'nan',
       'maybe the tomato seeds crossbred with the tobacco seeds.'],
      dtype='<U73')

In [6]:
# Inspect the training labels (one character name per quote).
y_train

array(['bart', 'homer', 'bart', ..., 'homer', 'marge', 'lisa'],
      dtype='<U5')

In [37]:
#  Naive Bayes baseline: word counts (English stop words removed) -> TF-IDF -> MultinomialNB
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

predicted = text_clf.predict(X_test)
# Fraction of held-out quotes attributed to the right character.
(predicted == y_test).mean()

0.4843296575739988

In [38]:
#  SGD-SVM: linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    # `n_iter` is no longer accepted by SGDClassifier; max_iter=5 with
    # tol=None runs the same fixed five passes over the training data.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
# Fraction of held-out quotes attributed to the right character.
np.mean(predicted_svm == y_test)



0.5168311085316308

In [39]:
# Random Forest: unigram counts -> TF-IDF -> 10-tree random forest
text_clf_RanFor = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Seed the forest so the reported accuracy is reproducible on re-run.
    ('clf-RanFor', RandomForestClassifier(n_estimators=10, random_state=42)),
])

text_clf_RanFor = text_clf_RanFor.fit(X_train, y_train)
predicted_RanFor = text_clf_RanFor.predict(X_test)
# Fraction of held-out quotes attributed to the right character.
np.mean(predicted_RanFor == y_test)

0.4762042948345908

In [40]:
# Logistic regression: unigram counts -> TF-IDF -> LogisticRegression
text_clf_LogReg = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Step name now matches the estimator (it was a copy-pasted 'clf-RanFor').
    ('clf-LogReg', LogisticRegression()),
])

text_clf_LogReg = text_clf_LogReg.fit(X_train, y_train)
predicted_LogReg = text_clf_LogReg.predict(X_test)
# Fraction of held-out quotes attributed to the right character.
np.mean(predicted_LogReg == y_test)

0.524376088218224

In [41]:
# Grid-search the Naive Bayes pipeline over n-gram range, IDF weighting
# and the classifier's alpha.
parameters = {
    'vect__ngram_range': [(1, upper) for upper in range(1, 6)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1).fit(X_train, y_train)

print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.4712198897165522
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 5)}


In [42]:
# Grid-search the SGD-SVM pipeline over n-gram range, IDF weighting
# and the classifier's alpha.
parameters_svm = {
    'vect__ngram_range': [(1, upper) for upper in range(1, 6)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1).fit(X_train, y_train)


print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.5141723904421012
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}




In [43]:
# Parameter-adjusted SGD-SVM: uni+bigrams, IDF weighting and alpha=0.001,
# the best combination reported by the SVM grid search.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # `n_iter` is no longer accepted by SGDClassifier; max_iter=4 with
    # tol=None runs the same fixed four passes over the training data.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=.001,
                              max_iter=4, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
# Fraction of held-out quotes attributed to the right character.
np.mean(predicted_svm == y_test)



0.5249564712710388

In [44]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Characters,Quotes
0,homer,"baaart, i'm gonna open my mouth and close my e..."
1,bart,nobody better lay a finger on my butterfinger.
2,homer,what happened to the lights?
3,lisa,"it's just a power outage, dad."
4,lisa,it's just bart eating a butterfinger!
5,homer,hmmm...and it sounds like he's right over...
6,homer,"aw, how 'bout a bite, little buddy?"
7,bart,"oh, okay, homer. just one."
8,homer,"hey, this tastes just like a ... dog biscuit!"
9,bart,nobody better lay a finger on my butterfinger!
