In [1]:
import pandas as pd
from sklearn.metrics import confusion_matrix

In [2]:
simpsons = pd.read_csv('simpsons_dataset.csv')
simpsons.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [3]:
selection = simpsons.loc[(simpsons["raw_character_text"] == 'Bart Simpson') | (simpsons["raw_character_text"] == 'Lisa Simpson')]

In [4]:
selection.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [5]:
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object

text = selection['spoken_words'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode

vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
feature_names = vect.get_feature_names() #Get the words from the vocabulary
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14258 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [6]:
matrix = vect.transform(text) #The transform method from the CountVectorizer object creates the matrix
print(matrix[0:50,0:50]) #Let's print a little part of the matrix: the first 50 words & documents




In [7]:
docu_feat = pd.DataFrame(matrix.toarray()) #make a regular matrix, then put in Dataframe
docu_feat.index = selection['spoken_words'] #Give the rows names (text of the review)
docu_feat.columns = feature_names #Give the columns names (words from vocabulary)

In [8]:
docu_feat.iloc[0:4, 1000:1015] #Show a part of the matrix

Unnamed: 0_level_0,bartholemew,bartholomew,bartish,bartman,barto,bartrand,bartron,barts,barty,bas,base,baseball,based,basement,basements
spoken_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Where's Mr. Bergstrom?,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
That life is worth living.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Victory party under the slide!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Mr. Bergstrom! Mr. Bergstrom!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [40]:
from sklearn.linear_model import LinearRegression #the model
from sklearn.model_selection import train_test_split #the function to split the data

#Setting up the data and model
X = matrix #selecting the variables to go into my X matrix
y = selection['raw_character_text'] #creating the y vector

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) #split the data and store it




In [41]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [42]:
y_test_pred = model.predict(X_test)

In [43]:
print(y_test_pred)

['Bart Simpson' 'Bart Simpson' 'Lisa Simpson' ... 'Bart Simpson'
 'Bart Simpson' 'Lisa Simpson']


In [44]:
accuracy = model.score(X_test, y_test)

In [45]:
print(f'The accuracy is: {accuracy}' )

The accuracy is: 0.6417161716171618


In [46]:
selection['class'] = model.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
