# Text Analytics of The Grimm Brother's Tales
 
## Supervised Analysis

In [1]:
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

In [2]:
text_ff = pd.read_pickle("grimm_tales_ff.pckl")
text_ff.shape

(42, 3910)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train, test = train_test_split(text_ff, test_size=0.30, random_state=4, shuffle=False)

In [5]:
print(train.shape)
print(test.shape)

(29, 3910)
(13, 3910)


In [6]:
train.groupby(train['label']).count()['A']

label
Animal Tales      12
Tales of Magic    17
Name: A, dtype: int64

In [7]:
test.groupby(test['label']).count()['A']

label
Animal Tales      6
Tales of Magic    7
Name: A, dtype: int64

In [8]:
X_train = train.iloc[:,0:(train.shape[1]-1)]
X_test = test.iloc[:,0:(test.shape[1]-1)]

In [9]:
train['label'] = pd.Categorical(train['label'])
y_train = train['label'].cat.codes

test['label'] = pd.Categorical(test['label'])
y_test = test['label'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [11]:
mod1 = LogisticRegression(max_iter=10000)
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

Train
col_0   0   1
row_0        
0      12   0
1       0  17
---------------------------------
Test
col_0  0  1
row_0      
0      5  1
1      2  5


Accuracy - train: 1.000000
Accuracy - test: 0.769231
AUC - train: 1.000000
AUC - test: 0.773810


In [12]:
res = {1:{'model':'Logistic Regression - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }}

In [13]:
from sklearn.tree import DecisionTreeClassifier

mod1 = DecisionTreeClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

Train
col_0   0   1
row_0        
0      12   0
1       0  17
---------------------------------
Test
col_0  0  1
row_0      
0      1  5
1      1  6


Accuracy - train: 1.000000
Accuracy - test: 0.538462
AUC - train: 1.000000
AUC - test: 0.511905


In [14]:
res[2] = {'model':'Logistic DecisionTreeClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

In [15]:
from sklearn.ensemble import RandomForestClassifier

mod1 = RandomForestClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

Train
col_0   0   1
row_0        
0      12   0
1       0  17
---------------------------------
Test
col_0  0  1
row_0      
0      5  1
1      2  5


Accuracy - train: 1.000000
Accuracy - test: 0.769231
AUC - train: 1.000000
AUC - test: 0.773810


In [16]:
res[3] = {'model':'RandomForestClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

In [17]:
from sklearn.ensemble import AdaBoostClassifier

mod1 = AdaBoostClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[4] = {'model':'AdaBoostClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0   0   1
row_0        
0      12   0
1       0  17
---------------------------------
Test
col_0  0  1
row_0      
0      1  5
1      1  6


Accuracy - train: 1.000000
Accuracy - test: 0.538462
AUC - train: 1.000000
AUC - test: 0.511905


In [18]:
from sklearn.ensemble import GradientBoostingClassifier

mod1 = GradientBoostingClassifier()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[5] = {'model':'GradientBoostingClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0   0   1
row_0        
0      12   0
1       0  17
---------------------------------
Test
col_0  0  1
row_0      
0      1  5
1      1  6


Accuracy - train: 1.000000
Accuracy - test: 0.538462
AUC - train: 1.000000
AUC - test: 0.511905


In [19]:
from sklearn.svm import SVC

mod1 = SVC()
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[6] = {'model':'SVM - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0  0   1
row_0       
0      8   4
1      1  16
---------------------------------
Test
col_0  0  1
row_0      
0      5  1
1      3  4


Accuracy - train: 0.827586
Accuracy - test: 0.692308
AUC - train: 0.803922
AUC - test: 0.702381


In [25]:
from sklearn.neighbors import KNeighborsClassifier

mod1 = KNeighborsClassifier(metric='cosine')
mod1.fit(X_train,y_train)

yhat1 = mod1.predict(X_train)
#yprob1 = mod1.predict_proba(X_train)
print("Train")
print(pd.crosstab(y_train,yhat1))
print("---------------------------------")
yhat2 = mod1.predict(X_test)
print("Test")
print(pd.crosstab(y_test,yhat2))

print("\n")
### Accuracy:
print("Accuracy - train: %f" % mod1.score(X_train,y_train))
print("Accuracy - test: %f" % mod1.score(X_test,y_test))

### AUC
print("AUC - train: %f" % roc_auc_score(y_train, yhat1))
print("AUC - test: %f" % roc_auc_score(y_test, yhat2))

res[7] = {'model':'KNeighborsClassifier - base model',
           'accuracy-Train':mod1.score(X_train,y_train),
           'accuracy-Test':mod1.score(X_test,y_test),
           'AUC-train':roc_auc_score(y_train, yhat1),
           'AUC-test':roc_auc_score(y_test, yhat2)
      }

Train
col_0  0   1
row_0       
0      5   7
1      0  17
---------------------------------
Test
col_0  0  1
row_0      
0      1  5
1      0  7


Accuracy - train: 0.758621
Accuracy - test: 0.615385
AUC - train: 0.708333
AUC - test: 0.583333


In [26]:
res

{1: {'model': 'Logistic Regression - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.7692307692307693,
  'AUC-train': 1.0,
  'AUC-test': 0.773809523809524},
 2: {'model': 'Logistic DecisionTreeClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.5384615384615384,
  'AUC-train': 1.0,
  'AUC-test': 0.5119047619047619},
 3: {'model': 'RandomForestClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.7692307692307693,
  'AUC-train': 1.0,
  'AUC-test': 0.773809523809524},
 4: {'model': 'AdaBoostClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.5384615384615384,
  'AUC-train': 1.0,
  'AUC-test': 0.5119047619047619},
 5: {'model': 'GradientBoostingClassifier - base model',
  'accuracy-Train': 1.0,
  'accuracy-Test': 0.5384615384615384,
  'AUC-train': 1.0,
  'AUC-test': 0.5119047619047619},
 6: {'model': 'SVM - base model',
  'accuracy-Train': 0.8275862068965517,
  'accuracy-Test': 0.6923076923076923,
  'AUC-train': 0.803921

In [27]:
res2 = pd.DataFrame(res)
res2

Unnamed: 0,1,2,3,4,5,6,7
model,Logistic Regression - base model,Logistic DecisionTreeClassifier - base model,RandomForestClassifier - base model,AdaBoostClassifier - base model,GradientBoostingClassifier - base model,SVM - base model,KNeighborsClassifier - base model
accuracy-Train,1,1,1,1,1,0.827586,0.758621
accuracy-Test,0.769231,0.538462,0.769231,0.538462,0.538462,0.692308,0.615385
AUC-train,1,1,1,1,1,0.803922,0.708333
AUC-test,0.77381,0.511905,0.77381,0.511905,0.511905,0.702381,0.583333


In [28]:
res2.transpose()

Unnamed: 0,model,accuracy-Train,accuracy-Test,AUC-train,AUC-test
1,Logistic Regression - base model,1.0,0.769231,1.0,0.77381
2,Logistic DecisionTreeClassifier - base model,1.0,0.538462,1.0,0.511905
3,RandomForestClassifier - base model,1.0,0.769231,1.0,0.77381
4,AdaBoostClassifier - base model,1.0,0.538462,1.0,0.511905
5,GradientBoostingClassifier - base model,1.0,0.538462,1.0,0.511905
6,SVM - base model,0.827586,0.692308,0.803922,0.702381
7,KNeighborsClassifier - base model,0.758621,0.615385,0.708333,0.583333
