In [23]:
import pandas as pd, numpy as np
from fuzzywuzzy import process, fuzz
import matplotlib.pyplot as plt
import seaborn as sns
import string 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# Load the tab-separated training set (one title and its category label per
# row) and display it for a first look.
training = pd.read_table('cleaned_latotale.tsv')
training

Unnamed: 0,unittitle,function
0,"Adrien, Veuve Timothée",Personne
1,"Aigron, Demoiselle",Personne
2,"Albert, Jean",Personne
3,"Aldin, Jean (alias Alding)",Personne
4,"Alizeau, François",Personne
...,...,...
10873,Compagnie de la mer du Sud,Affaire
10874,Croix de vie,Affaire
10875,Invalides,Affaire
10876,"Oléron, ile d’",Affaire


In [3]:
# Encode the category labels as integers for the classifier:
#   Personne -> 0, Bateau -> 1, Affaire -> 2
# One element-wise pass over the column with a lookup table replaces three
# row-wise DataFrame.apply(axis=1) passes. Values missing from the table
# (including codes produced by an earlier run of this cell) are kept as-is,
# so re-running the cell does not corrupt the column.
label_codes = {'Personne': 0, 'Bateau': 1, 'Affaire': 2}
training['function'] = training['function'].map(lambda label: label_codes.get(label, label))

In [4]:
# Display the frame to check that the 'function' column now holds integer codes.
training

Unnamed: 0,unittitle,function
0,"Adrien, Veuve Timothée",0
1,"Aigron, Demoiselle",0
2,"Albert, Jean",0
3,"Aldin, Jean (alias Alding)",0
4,"Alizeau, François",0
...,...,...
10873,Compagnie de la mer du Sud,2
10874,Croix de vie,2
10875,Invalides,2
10876,"Oléron, ile d’",2


In [5]:
# (rows, columns) before duplicate rows are removed.
training.shape

(10878, 2)

In [6]:
# Remove rows that are exact duplicates across all columns (same title and
# same label). The name is rebound rather than mutated with inplace=True,
# which pandas discourages; the resulting frame is the same. The original
# index labels are kept (no reset_index), as before.
training = training.drop_duplicates()

In [7]:
# Count missing values per column.
training.isna().sum()

unittitle    0
function     0
dtype: int64

In [8]:
# (rows, columns) after drop_duplicates.
training.shape

(10561, 2)

In [9]:
def process_text(text, punctuation=string.punctuation):
    """Tokenise a title: delete punctuation characters, then split on whitespace.

    Passed to ``CountVectorizer`` as its ``analyzer`` callable, which calls it
    with a single positional argument, so ``punctuation`` keeps its default
    there.

    Parameters
    ----------
    text : str
        The raw title.
    punctuation : str, optional
        Characters to delete before splitting. Defaults to
        ``string.punctuation``, which only contains ASCII punctuation;
        typographic characters such as ``’`` are therefore kept unless they
        are passed explicitly (e.g. ``string.punctuation + "’"``).

    Returns
    -------
    list of str
        The whitespace-separated tokens, with case and accents untouched.
        Empty or whitespace-only input gives an empty list.

    Notes
    -----
    Punctuation is deleted, not replaced by a space, so words joined only by
    punctuation are merged into one token (``"a,b"`` -> ``["ab"]``).
    """
    # Remove punctuation: a translation table deletes every listed character
    # in a single pass over the string.
    stripped = text.translate(str.maketrans('', '', punctuation))

    # split() with no argument splits on runs of whitespace and never yields
    # empty strings, so it already returns the word list.
    return stripped.split()
    
    

In [10]:
# Sanity check: run the tokenizer over the first five titles.
training['unittitle'].head().map(process_text)

0       [Adrien, Veuve, Timothée]
1            [Aigron, Demoiselle]
2                  [Albert, Jean]
3    [Aldin, Jean, alias, Alding]
4             [Alizeau, François]
Name: unittitle, dtype: object

In [11]:
# Bag-of-words features: one row per title, one column per distinct token
# produced by process_text.
# Keep a reference to the fitted vectorizer: its learned vocabulary is needed
# to transform unseen titles into the same feature space at prediction time
# (vectorizer.transform(new_titles)).
# NOTE(review): the vocabulary is learned from all rows, before the
# train/test split that follows, so tokens that occur only in the test rows
# still get a column. Consider fitting on the training rows only.
vectorizer = CountVectorizer(analyzer=process_text)
bow = vectorizer.fit_transform(training['unittitle'])

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    bow,
    training['function'],
    test_size=0.20,
    random_state=0,
)

In [14]:
# (number of titles, number of distinct tokens) in the bag-of-words matrix.
bow.shape

(10561, 8458)

In [16]:
# Multinomial naive Bayes on the token counts, trained on the training split.
# fit() returns the estimator itself, so the name stays bound to the fitted model.
classifier = MultinomialNB()
classifier = classifier.fit(X_train, y_train)

In [29]:
# Eyeball predicted vs. true labels on the training split
# (first line: predictions, second line: ground truth).
print(classifier.predict(X_train), y_train.values, sep='\n')

[0 1 0 ... 0 1 0]
[0 1 1 ... 0 1 0]


In [30]:
# Metrics on the training split (data the model has already seen).
pred = classifier.predict(X_train)
print(classification_report(y_true=y_train, y_pred=pred))
print('Confusion Matrix: \n', confusion_matrix(y_true=y_train, y_pred=pred))
print('Accuracy: ', accuracy_score(y_true=y_train, y_pred=pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      7660
           1       0.98      0.74      0.85       575
           2       1.00      0.14      0.25       213

    accuracy                           0.96      8448
   macro avg       0.98      0.63      0.69      8448
weighted avg       0.96      0.96      0.95      8448

Confusion Matrix: 
 [[7657    3    0]
 [ 147  428    0]
 [ 179    4   30]]
Accuracy:  0.9605823863636364


In [31]:
# Metrics on the held-out test split.
pred = classifier.predict(X_test)
print(classification_report(y_true=y_test, y_pred=pred))
print('Confusion Matrix: \n', confusion_matrix(y_true=y_test, y_pred=pred))
print('Accuracy: ', accuracy_score(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1903
           1       0.99      0.72      0.83       166
           2       1.00      0.05      0.09        44

    accuracy                           0.96      2113
   macro avg       0.98      0.59      0.63      2113
weighted avg       0.96      0.96      0.95      2113

Confusion Matrix: 
 [[1902    1    0]
 [  47  119    0]
 [  42    0    2]]
Accuracy:  0.9574065309985802
