In [1]:
import pandas as pd
import numpy as np
import nltk


from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline



In [2]:
# Load the quotes dataset and discard the "Unnamed: 0" column in one
# expression, so re-running this cell never fails on an already-deleted column.
df = pd.read_csv('clean_4characters_tv.csv').drop("Unnamed: 0", axis=1)

# Preview the first three rows.
df.head(3)

Unnamed: 0,Quotes,Characters
0,Space It seems to go on and on forever But the...,Fry
1,And thats how you play the game,Fry
2,Michelle baby Where you going,Fry


In [3]:
# Extract the quote text (features) and the character names (labels) as
# NumPy unicode arrays, then confirm both have the same length.
X, y = (df[column].values.astype('U') for column in ("Quotes", "Characters"))
print(X.shape, y.shape)

(9104,) (9104,)


In [4]:
# Exploratory bag-of-words step: look at how many distinct tokens the quotes
# and the labels contain.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)

# The labels get their own vectorizer. Calling fit_transform on `count_vect`
# a second time would re-fit it and replace the vocabulary it just learned
# from the quotes, leaving `count_vect` unusable for transforming new quotes.
label_vect = CountVectorizer()
y_counts = label_vect.fit_transform(y)

# Shapes are (n_samples, vocabulary size) for quotes and labels respectively.
print(X_counts.shape)
print(y_counts.shape)

(9104, 10866)
(9104, 4)


In [5]:
# Exploratory TF-IDF step on top of the raw token counts.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# The label counts get a separate transformer. Re-fitting `tfidf_transformer`
# on `y_counts` would overwrite the IDF weights it learned from the quote
# counts.
label_tfidf_transformer = TfidfTransformer()
y_tfidf = label_tfidf_transformer.fit_transform(y_counts)

# TF-IDF re-weights the counts but keeps the matrix shapes unchanged.
print(X_tfidf.shape)
print(y_tfidf.shape)

(9104, 10866)
(9104, 4)


In [6]:
# Hold out 25% of the quotes (scikit-learn's default test size, written out
# here for the reader) with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [7]:
# Naive Bayes baseline: token counts (English stop words removed) -> TF-IDF
# weighting -> multinomial Naive Bayes classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# fit() trains the pipeline in place; the trailing semicolon keeps the
# notebook from echoing the estimator repr.
text_clf.fit(X_train, y_train);

In [8]:
# Accuracy of the Naive Bayes pipeline on the held-out quotes: the fraction
# of predictions that match the true character.
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()

0.43717047451669594

In [9]:
# Linear SVM baseline: same vectorizer / TF-IDF front end as the Naive Bayes
# pipeline, with an SGD-trained hinge-loss (linear SVM) classifier on top.
#
# `n_iter` was removed from SGDClassifier in scikit-learn 0.21. The documented
# equivalent is `max_iter` with `tol=None`, which runs exactly that many
# epochs with no early stopping.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words="english")),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)

# Accuracy on the held-out quotes.
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)



0.4679261862917399

In [10]:
# Hyper-parameter grid for the Naive Bayes pipeline. Keys follow the
# "<step name>__<parameter>" convention used by scikit-learn pipelines.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with / without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3),             # Naive Bayes smoothing strength
}

In [11]:
# Cross-validated grid search over the Naive Bayes pipeline, using every
# available CPU core (n_jobs=-1). Only the training split is used.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(X_train, y_train)

In [12]:
# Report the best cross-validation score found by the Naive Bayes grid search
# and the parameter combination that achieved it, one per line.
print(gs_clf.best_score_, gs_clf.best_params_, sep="\n")

0.4209138840070299
{'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [13]:
# Hyper-parameter grid for the SVM pipeline; the classifier step is named
# 'clf-svm', hence the 'clf-svm__alpha' key.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
}

# Cross-validated grid search on the training split, using all CPU cores.
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    n_jobs=-1,
).fit(X_train, y_train)

# Best cross-validation score and the parameters that produced it.
print(gs_clf_svm.best_score_, gs_clf_svm.best_params_, sep="\n")

0.4443468072642062
{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}




In [14]:
# Final SVM pipeline with hand-picked settings: unigrams only, no stop-word
# removal, IDF weighting on, alpha=0.01 and 4 training epochs.
#
# `n_iter` was removed from SGDClassifier in scikit-learn 0.21. The documented
# equivalent is `max_iter` with `tol=None`, which runs exactly that many
# epochs with no early stopping.
#
# NOTE(review): these settings differ from the best parameters printed by the
# SVM grid search in this notebook, and appear to have been chosen by looking
# at the test-split accuracy. Confirm that is intended.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=.01,
                              max_iter=4, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)

# Accuracy on the held-out quotes.
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)



0.4833040421792619