In [4]:
import pandas as pd
import numpy as np
import sklearn as sc
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the raw Shakespeare lines dataset; the path is relative to this notebook's directory.
raw_csv_path = '../data/raw/Shakespeare_data.csv'
df_shakespeare = pd.read_csv(raw_csv_path)
# Preview the first rows to check the columns that were read.
df_shakespeare.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [5]:
# Store the act.scene.line reference as text and drop the `Dataline` column.
# NOTE(review): on the pandas version that produced the outputs in this notebook,
# missing ActSceneLine values appear to become the literal string 'nan' after the
# cast -- confirm if upgrading pandas.
# The chained form (instead of `del`) keeps the cell re-runnable: `errors='ignore'`
# means a second execution does not raise KeyError once `Dataline` is already gone.
df_shakespeare = (
    df_shakespeare
    .assign(ActSceneLine=lambda frame: frame['ActSceneLine'].astype(str))
    .drop(columns=['Dataline'], errors='ignore')
)

In [6]:
# Rows without a speaker (e.g. act/scene headings and stage directions) have NaN in
# `Player`; label them 'Other' because they are not an actual character.
# The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: an in-place call on a selected column acts on
# an intermediate object, which pandas warns about and which no longer updates the
# parent frame under Copy-on-Write.
df_shakespeare['Player'] = df_shakespeare['Player'].fillna('Other')
df_shakespeare.head()

Unnamed: 0,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,Henry IV,,,Other,ACT I
1,Henry IV,,,Other,SCENE I. London. The palace.
2,Henry IV,,,Other,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [7]:
# One row per distinct value of `Player`, in order of first appearance.
pd.DataFrame({'Characters': df_shakespeare['Player'].unique()})

Unnamed: 0,Characters
0,Other
1,KING HENRY IV
2,WESTMORELAND
3,FALSTAFF
4,PRINCE HENRY
5,POINS
6,EARL OF WORCESTER
7,NORTHUMBERLAND
8,HOTSPUR
9,SIR WALTER BLUNT


Trying to figure out how to organize this data so it can be useful; I want to see the number of lines per player grouped by the play.

In [8]:
# Number of rows with a non-null `PlayerLinenumber` for every (Play, Player) pair.
lines_by_play_and_player = df_shakespeare.groupby(['Play', 'Player'])['PlayerLinenumber']
grouped = lines_by_play_and_player.count()
grouped

Play                      Player            
A Comedy of Errors        ADRIANA               284
                          AEGEON                150
                          AEMELIA                75
                          ANGELO                 99
                          ANTIPHOLUS              6
                          BALTHAZAR              31
                          Courtezan              43
                          DROMIO OF EPHESUS     191
                          DROMIO OF SYRACUSE    323
                          DUKE SOLINUS           97
                          First Merchant         19
                          Gaoler                  1
                          LUCE                   11
                          LUCIANA               118
                          OCTAVIUS CAESAR         3
                          OF EPHESUS            221
                          OF SYRACUSE           292
                          Officer                17
                   

In [9]:
# Label-encode the categorical columns so they become integer codes.
# Only these three columns are encoded for now because it is not yet clear how
# important the other ones will be for classification.
#
# Each column gets its own fitted encoder, stored in `encoders`, so that integer
# codes (including the `Player` codes a model predicts) can be mapped back to the
# original strings later via `encoders[column].inverse_transform(...)`. Refitting a
# single shared encoder would keep only the mapping of the last column.
#
# Caution: the columns are overwritten in place, so re-running this cell without
# reloading the data would refit the encoders on the integer codes.
encoders = {}
for column in ['Play', 'Player', 'ActSceneLine']:
    encoders[column] = LabelEncoder()
    df_shakespeare[column] = encoders[column].fit_transform(df_shakespeare[column])
# `le` used to end this cell fitted on 'ActSceneLine'; keep that binding available.
le = encoders['ActSceneLine']
df_shakespeare.head()

Unnamed: 0,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,9,,16122,623,ACT I
1,9,,16122,623,SCENE I. London. The palace.
2,9,,16122,623,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,9,1.0,324,457,"So shaken as we are, so wan with care,"
4,9,1.0,435,457,"Find we a time for frighted peace to pant,"


In [14]:
# Hold out 20% of the rows for testing (the 80/20 split discussed in class).
from sklearn.model_selection import train_test_split

# Predictors are the play and the act/scene/line columns; the target is the player.
feature_columns = ['Play', 'ActSceneLine']
features = df_shakespeare[feature_columns]
labels = df_shakespeare['Player']

# A fixed random_state keeps the same rows in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

Using a random forest classifier. I'm not sure it will give the best accuracy, but it's a
reasonable starting point since I've never tackled classification before. 

In [19]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training split. The forest is randomized, so
# `random_state` is fixed (same seed as the train/test split) to make the fitted
# model -- and any accuracy computed from it -- reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict the label for every row of the held-out test set.
prediction = model.predict(X_test)
print(prediction)

[169 402 141 ... 133 305 298]


The output of this cell doesn't mean much on its own, so it's a good idea to convert it into an actual measure of accuracy. For this we can use `accuracy_score` from sklearn. 

In [20]:
from sklearn.metrics import accuracy_score
# Share of test rows whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print(test_accuracy)

0.6089766606822262


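Better than 50% accuracy, so for now I will take that as a win. Keep in mind, though, that 50% is only the chance level for a balanced two-class problem; with this many distinct players, a fairer baseline is the accuracy obtained by always predicting the most frequent player, which is worth computing before reading too much into this number. 
For the future, it would be cool to try language analysis and see whether we can predict the player from the sentiment of their lines. 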