# Text Analytics of The Grimm Brother's Tales
 
## Supervised Analysis

In [1]:
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

In [5]:
text_ff = pd.read_pickle("..\data\grimm_tales_ff.pckl")
text_ff.shape

(44, 3779)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
train, test = train_test_split(text_ff, test_size=0.30, random_state=4, shuffle=False)

In [8]:
print(train.shape)
print(test.shape)

(30, 3779)
(14, 3779)


In [16]:

train.groupby(train['label']).count()['certain']

label
Animal Tales      11
Tales of Magic    19
Name: certain, dtype: int64

In [17]:
test.groupby(test['label']).count()['certain']

label
Animal Tales      7
Tales of Magic    7
Name: certain, dtype: int64

In [18]:
X_train = train.iloc[:,0:(train.shape[1]-1)]
X_test = test.iloc[:,0:(test.shape[1]-1)]

In [19]:
train['label'] = pd.Categorical(train['label'])
y_train = train['label'].cat.codes

test['label'] = pd.Categorical(test['label'])
y_test = test['label'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [21]:
mod1 = LogisticRegression(max_iter=10000)
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

Train
col_0   0   1
row_0        
0      11   0
1       0  19
---------------------------------
Test
col_0  0  1
row_0      
0      6  1
1      1  6


Accuracy - train: 1.000000
Accuracy - test: 0.857143
AUC - train: 1.000000
AUC - test: 0.857143


In [22]:
res = {1:{'model':'Logistic Regression - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }}

In [23]:
from sklearn.tree import DecisionTreeClassifier

mod1 = DecisionTreeClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

Train
col_0   0   1
row_0        
0      11   0
1       0  19
---------------------------------
Test
col_0  0  1
row_0      
0      2  5
1      0  7


Accuracy - train: 1.000000
Accuracy - test: 0.642857
AUC - train: 1.000000
AUC - test: 0.642857


In [24]:
res[2] = {'model':'Logistic DecisionTreeClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

In [25]:
from sklearn.ensemble import RandomForestClassifier

mod1 = RandomForestClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

Train
col_0   0   1
row_0        
0      11   0
1       0  19
---------------------------------
Test
col_0  0  1
row_0      
0      7  0
1      0  7


Accuracy - train: 1.000000
Accuracy - test: 1.000000
AUC - train: 1.000000
AUC - test: 1.000000


In [26]:
res[3] = {'model':'RandomForestClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

In [27]:
from sklearn.ensemble import AdaBoostClassifier

mod1 = AdaBoostClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[4] = {'model':'AdaBoostClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0   0   1
row_0        
0      11   0
1       0  19
---------------------------------
Test
col_0  0  1
row_0      
0      2  5
1      0  7


Accuracy - train: 1.000000
Accuracy - test: 0.642857
AUC - train: 1.000000
AUC - test: 0.642857


In [28]:
from sklearn.ensemble import GradientBoostingClassifier

mod1 = GradientBoostingClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[5] = {'model':'GradientBoostingClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0   0   1
row_0        
0      11   0
1       0  19
---------------------------------
Test
col_0  0  1
row_0      
0      2  5
1      0  7


Accuracy - train: 1.000000
Accuracy - test: 0.642857
AUC - train: 1.000000
AUC - test: 0.642857


In [29]:
from sklearn.svm import SVC

mod1 = SVC()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[6] = {'model':'SVM - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0   0   1
row_0        
0      11   0
1       1  18
---------------------------------
Test
col_0  0  1
row_0      
0      6  1
1      2  5


Accuracy - train: 0.966667
Accuracy - test: 0.785714
AUC - train: 0.973684
AUC - test: 0.785714


In [30]:
from sklearn.neighbors import KNeighborsClassifier

mod1 = KNeighborsClassifier(metric='cosine')
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[7] = {'model':'KNeighborsClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0  0   1
row_0       
0      2   9
1      0  19
---------------------------------
Test
col_0  0  1
row_0      
0      2  5
1      0  7


Accuracy - train: 0.700000
Accuracy - test: 0.642857
AUC - train: 0.590909
AUC - test: 0.642857


In [31]:
res

{1: {'model': 'Logistic Regression - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.8571428571428571,
  'AUC-train': 1.0,
  'AUC-test': 0.8571428571428572},
 2: {'model': 'Logistic DecisionTreeClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.6428571428571429,
  'AUC-train': 1.0,
  'AUC-test': 0.6428571428571428},
 3: {'model': 'RandomForestClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 1.0,
  'AUC-train': 1.0,
  'AUC-test': 1.0},
 4: {'model': 'AdaBoostClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.6428571428571429,
  'AUC-train': 1.0,
  'AUC-test': 0.6428571428571428},
 5: {'model': 'GradientBoostingClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.6428571428571429,
  'AUC-train': 1.0,
  'AUC-test': 0.6428571428571428},
 6: {'model': 'SVM - base model',
  'accuracy-Train': 0.9666666666666667,
  'accuracy-Test': 0.7857142857142857,
  'AUC-train': 0.9736842105263157,
  'AUC-test': 0.

In [32]:
res2 = pd.DataFrame(res)
res2

Unnamed: 0,1,2,3,4,5,6,7
model,Logistic Regression - base model,Logistic DecisionTreeClassifier - base model,RandomForestClassifier - base model,AdaBoostClassifier - base model,GradientBoostingClassifier - base model,SVM - base model,KNeighborsClassifier - base model
accuracy-Train,1,1,1,1,1,0.966667,0.7
accuracy-Test,0.857143,0.642857,1,0.642857,0.642857,0.785714,0.642857
AUC-train,1,1,1,1,1,0.973684,0.590909
AUC-test,0.857143,0.642857,1,0.642857,0.642857,0.785714,0.642857


In [33]:
res2.transpose()

Unnamed: 0,model,accuracy-Train,accuracy-Test,AUC-train,AUC-test
1,Logistic Regression - base model,1.0,0.857143,1.0,0.857143
2,Logistic DecisionTreeClassifier - base model,1.0,0.642857,1.0,0.642857
3,RandomForestClassifier - base model,1.0,1.0,1.0,1.0
4,AdaBoostClassifier - base model,1.0,0.642857,1.0,0.642857
5,GradientBoostingClassifier - base model,1.0,0.642857,1.0,0.642857
6,SVM - base model,0.966667,0.785714,0.973684,0.785714
7,KNeighborsClassifier - base model,0.7,0.642857,0.590909,0.642857
