In [1]:
import numpy as np 
import csv
import os
import re
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
import spacy
from spacy import displacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.chunk import conlltags2tree, tree2conlltags
import altair as alt
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics


from sklearn.metrics import classification_report

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *


from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE



In [2]:
# Directory that holds the per-document-type CSV exports.
path = '/home/sunitc/work'

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of raw_df (and therefore any later "first N rows per type" selection)
# reproducible across runs and machines.  endswith() avoids picking up files
# that merely contain ".csv" in their name (e.g. "x.csv.bak").
csv_filenames = sorted(
    filename for filename in os.listdir(path) if filename.endswith('.csv')
)
if not csv_filenames:
    raise FileNotFoundError(f"No .csv files found in {path!r}")

# Read every CSV once and concatenate in a single call.
dfs = [pd.read_csv(os.path.join(path, filename)) for filename in csv_filenames]

raw_df = pd.concat(dfs, axis=0, ignore_index=True)
# Drop the "Unnamed: 0" column without mutating in place, so re-running the
# cell always starts from the freshly concatenated frame.
raw_df = raw_df.drop(columns="Unnamed: 0")
raw_df.head()

Unnamed: 0,doc_type,doc_text
0,Memo,To: FROM: SUBJECT: According Materials in Be...
1,Form,7002 00 OF (2 DOCUMENT CLEARANCE SHEET P (For...
2,Functional specification,‘CABARRUS COUNTY EFFECTIVE DATE MARLBORO 100 R...
3,Budget,4 LEO BURNETT U.S.A. : ADVERTISING . NEWSP...
4,Handwritten Note,| 2 OFF This mest oe 4 (okel 2.000 FF “u my am...


In [3]:
raw_df.describe()

Unnamed: 0,doc_type,doc_text
count,301668,301668
unique,24,299219
top,Resume,IMAGE NOT AVAILABLE ONLINE The material refe...
freq,23359,244


In [4]:
raw_df.doc_type.unique()

array(['Memo', 'Form', 'Functional specification', 'Budget',
       'Handwritten Note', 'Resume', 'Invoice', 'Questionnaire', 'Letter',
       'E-mail', 'News Article', 'Advertisement', 'Presentation',
       'scientific report', 'scientific publication', 'File Folder',
       'News release', 'Taxes', 'Agreements', 'Human Resources', 'SOW',
       'Financial Statements', 'Annual Report', 'Product Documentation'],
      dtype=object)

In [5]:
# Two-column summary table: one row per doc_type with its row count,
# ordered as value_counts() returns it, on a fresh 0..n-1 index.
doc_type_df = (
    raw_df.doc_type
    .value_counts()
    .rename_axis('doc_type')
    .reset_index(name='count')
)
doc_type_df

Unnamed: 0,doc_type,count
0,Resume,23359
1,E-mail,19878
2,Letter,19849
3,Functional specification,19845
4,Form,19693
5,Invoice,19676
6,News Article,19641
7,Memo,19637
8,scientific publication,19633
9,scientific report,19586


In [7]:
# Preview the first rows of the combined frame.  Only the last expression of a
# cell is displayed, so a describe() call placed before head() would be
# computed and silently discarded; keep summary statistics in their own cell.
raw_df.head(5)

Unnamed: 0,doc_type,doc_text
0,Memo,To: FROM: SUBJECT: According Materials in Be...
1,Form,7002 00 OF (2 DOCUMENT CLEARANCE SHEET P (For...
2,Functional specification,‘CABARRUS COUNTY EFFECTIVE DATE MARLBORO 100 R...
3,Budget,4 LEO BURNETT U.S.A. : ADVERTISING . NEWSP...
4,Handwritten Note,| 2 OFF This mest oe 4 (okel 2.000 FF “u my am...


In [8]:
# The 15 document types kept for modelling; rows of any other type are dropped.
lst_doc_types = [
    'Resume', 'E-mail', 'Letter', 'Functional specification', 'Form',
    'Invoice', 'News Article', 'Memo', 'scientific publication',
    'scientific report', 'Questionnaire', 'Budget', 'Presentation',
    'Handwritten Note', 'Advertisement',
]

# Boolean mask: True where the row's doc_type is one of the kept types.
is_kept_type = raw_df['doc_type'].isin(lst_doc_types)
final_df = raw_df[is_kept_type]
final_df

Unnamed: 0,doc_type,doc_text
0,Memo,To: FROM: SUBJECT: According Materials in Be...
1,Form,7002 00 OF (2 DOCUMENT CLEARANCE SHEET P (For...
2,Functional specification,‘CABARRUS COUNTY EFFECTIVE DATE MARLBORO 100 R...
3,Budget,4 LEO BURNETT U.S.A. : ADVERTISING . NEWSP...
4,Handwritten Note,| 2 OFF This mest oe 4 (okel 2.000 FF “u my am...
...,...,...
301663,Resume,RANK: SGT/E-5 NON- COMMISSIONED OFFIC...
301664,Resume,"GOVERNMENT RELATIONS, COMMUNICATIONS ..."
301665,Resume,GEEK SQUAD AGENT Professional...
301666,Resume,PROGRAM DIRECTOR / OFFICE MANAGER ...


In [9]:
final_df.doc_type.value_counts()

Resume                      23359
E-mail                      19878
Letter                      19849
Functional specification    19845
Form                        19693
Invoice                     19676
News Article                19641
Memo                        19637
scientific publication      19633
scientific report           19586
Questionnaire               19488
Budget                      19455
Presentation                19391
Handwritten Note            16590
Advertisement               15643
Name: doc_type, dtype: int64

In [10]:
# Cap every doc_type at its first `rows_per_type_cap` rows (in current row order).
rows_per_type_cap = 5000
rows_by_type = final_df.groupby('doc_type')
final_df = rows_by_type.head(rows_per_type_cap)
final_df

Unnamed: 0,doc_type,doc_text
0,Memo,To: FROM: SUBJECT: According Materials in Be...
1,Form,7002 00 OF (2 DOCUMENT CLEARANCE SHEET P (For...
2,Functional specification,‘CABARRUS COUNTY EFFECTIVE DATE MARLBORO 100 R...
3,Budget,4 LEO BURNETT U.S.A. : ADVERTISING . NEWSP...
4,Handwritten Note,| 2 OFF This mest oe 4 (okel 2.000 FF “u my am...
...,...,...
9170,Advertisement,is dhve to mcense crete tare and Ute prot snci...
9191,Advertisement,fT ft J 660099407
9201,Advertisement,Philip creates a program sales to At Philip ...
9202,Advertisement,TS NECKLACE } Wa qa aA me VIRGINTA SLIMS: ...


In [11]:
# Features are the document text column; labels are the document type column.
X, y = final_df['doc_text'], final_df['doc_type']

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [12]:
X_train.head(5)


1365          It\ 1 kro Mijpoae sires 9196 QSL LL OUSS...
2440    ; fe 2 1B ISE  Cortahcer, BLP  oe Thanch 194  ...
4240    make checks payable to:  ‘Account With  Reuben...
1719       , OUTDOOR BULLETIN =~                . DISP...
2435    PRIORITY  Ra INBIFO Institut for biologische F...
Name: doc_text, dtype: object

In [13]:
y_train.head(5)

1365     Handwritten Note
2440     Handwritten Note
4240               Budget
1719                 Form
2435    scientific report
Name: doc_type, dtype: object

In [15]:
# Section banner for the KNN experiments.
print("----KNN----")
# Disabled experiment: 5-fold grid search over n_neighbors in 1..44.
# NOTE(review): it fits on `cnt_v_train`, which is not defined in the visible
# cells -- confirm the intended matrix before re-enabling.
#k_vals = {'n_neighbors': np.arange(1,45)}
#knn = KNeighborsClassifier()
#knn_gs = GridSearchCV(estimator=knn,param_grid=k_vals,cv=5)
#knn_gs.fit(cnt_v_train,y_train)
#optimum_k = knn_gs.best_params_['n_neighbors']
#print("The optimum K-Value=", optimum_k)

----KNN----


In [16]:
lst_results = []

In [17]:
def call_knn(cv_train,y_train,cv_test,y_test,k_val):
    """Fit a k-nearest-neighbours classifier and print its test-set scores.

    Args:
        cv_train: Feature matrix the classifier is fitted on.
        y_train: Labels for ``cv_train``.
        cv_test: Feature matrix to predict.
        y_test: True labels for ``cv_test``.
        k_val: Value passed as ``n_neighbors``.

    Prints the weighted F1 score and the accuracy; returns nothing.
    """
    classifier = KNeighborsClassifier(n_neighbors=k_val)
    classifier.fit(cv_train, y_train)
    predicted = classifier.predict(cv_test)

    # Score first, then report, so nothing is printed if scoring fails.
    report_lines = (
        ("F1-Score ", metrics.f1_score(y_test, predicted, average="weighted")),
        ("Accuracy=", metrics.accuracy_score(y_test, predicted)),
    )
    print("KNN")
    for label, value in report_lines:
        print(label, value)
    print("-" * 25)

In [18]:
def call_MNB(cv_train,y_train,cv_test,y_test):
    """Fit a multinomial naive Bayes model (alpha=0.01) and print its scores.

    Args:
        cv_train: Feature matrix the model is fitted on.
        y_train: Labels for ``cv_train``.
        cv_test: Feature matrix to predict.
        y_test: True labels for ``cv_test``.

    Prints the weighted F1 score and the accuracy; returns nothing.
    """
    model = MultinomialNB(alpha=0.01)
    model.fit(cv_train, y_train)
    predicted_labels = model.predict(cv_test)

    weighted_f1 = metrics.f1_score(y_test, predicted_labels, average="weighted")
    accuracy = metrics.accuracy_score(y_test, predicted_labels)

    print("MultinomialNB")
    print("F1-Score ", weighted_f1)
    print("Accuracy=", accuracy)
    print("-" * 25)

In [19]:
def call_LogReg(cv_train,y_train,cv_test,y_test,c_val):
    """Fit an L2-penalised logistic regression and print its test-set scores.

    Args:
        cv_train: Feature matrix the model is fitted on.
        y_train: Labels for ``cv_train``.
        cv_test: Feature matrix to predict.
        y_test: True labels for ``cv_test``.
        c_val: Value passed as the ``C`` argument of ``LogisticRegression``.

    Prints the weighted F1 score and the accuracy; returns nothing.
    """
    lg1 = LogisticRegression(C=c_val, penalty="l2", max_iter=1000,
                             solver="liblinear", multi_class="auto")
    lg1.fit(cv_train, y_train)
    predict = lg1.predict(cv_test)
    f1lgscoreLR = metrics.f1_score(y_test, predict, average="weighted")
    accuracy = metrics.accuracy_score(y_test, predict)
    # The per-class sum of squared coefficients that used to be computed here
    # was never read, so the dead computation has been dropped.
    print("Logistic Regressions")
    print("F1-Score ", f1lgscoreLR)
    print("Accuracy=", accuracy)
    print(25*"-")

In [20]:
def call_svc(cv_train,y_train,cv_test,y_test):
    """Scale the features, fit a linear-kernel SVC and print its test-set scores.

    Args:
        cv_train: Feature matrix used to fit both the scaler and the classifier.
        y_train: Labels for ``cv_train``.
        cv_test: Feature matrix to predict (scaled with the training statistics).
        y_test: True labels for ``cv_test``.

    Prints the weighted F1 score and the accuracy; returns nothing.
    """
    # with_mean=False: scale without centring the features.
    scaler = StandardScaler(with_mean=False).fit(cv_train)
    train_scaled = scaler.transform(cv_train)
    test_scaled = scaler.transform(cv_test)

    classifier = SVC(C=1.0, random_state=1, kernel='linear')
    classifier.fit(train_scaled, y_train)
    predicted = classifier.predict(test_scaled)

    weighted_f1 = metrics.f1_score(y_test, predicted, average="weighted")
    accuracy = metrics.accuracy_score(y_test, predicted)

    print("SVC")
    print("F1-Score ", weighted_f1)
    print("Accuracy=", accuracy)
    print("-" * 25)

In [21]:
#c_vals = {'C' : np.arange(0.1,1,0.1)}    
#lg = LogisticRegression(penalty = "l2",max_iter=1000,solver="liblinear", multi_class="auto")
#lg_gs = GridSearchCV(estimator=lg,param_grid=c_vals,cv=5)
#lg_gs.fit(cv_train,y_train)
#optimum_C = lg_gs.best_params_['C']
#print ("Optimum C using GridSearchCV :", optimum_C)  

In [22]:
def doc_text_preprocessor(s, max_word_len=6):
    """Normalise one document's text before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is not an ASCII letter, digit, space or
         newline;
      3. replace each run of digits with the token `` NUM `` (space-padded);
      4. truncate every word longer than ``max_word_len`` characters to its
         first ``max_word_len`` characters.

    Args:
        s: The raw document text.
        max_word_len: Maximum number of characters kept per word.  Defaults to
            6, the value previously hard-coded here (chosen by experimenting
            with sizes 2-10).

    Returns:
        The normalised text.

    Raises:
        ValueError: If ``max_word_len`` is smaller than 1.
    """
    if max_word_len < 1:
        raise ValueError("max_word_len must be >= 1")

    s = s.lower()
    # NOTE(review): removed characters are replaced by the empty string, so
    # words separated only by punctuation or a tab are glued together
    # ("foo-bar" -> "foobar").  Kept as-is to preserve existing behaviour.
    s = re.sub(r"[^A-Za-z0-9 \n]", "", s)
    # Raw strings: "\d" / "\w" in a plain literal are invalid escape sequences.
    s = re.sub(r"\d+", " NUM ", s)
    # Truncation runs last, so it also applies to the inserted NUM token when
    # max_word_len is below 3.
    s = re.sub(r"(\w{%d})\w+" % max_word_len, r"\1", s)
    return s

In [23]:
# Baseline run: default vectorizers, no custom pre-processing.
cnt_v = CountVectorizer()
cv_train = cnt_v.fit_transform(X_train)
cv_test = cnt_v.transform(X_test)

tf_v = TfidfVectorizer()
tfv_train = tf_v.fit_transform(X_train)
tfv_test = tf_v.transform(X_test)

banner = 35 * "-"
print(banner)
print("No Pre-Processor")
print(banner)

# Each model is evaluated on the count features first, then on the tf-idf ones.
feature_sets = (
    ("CountVectorize", cv_train, cv_test),
    ("TfidfVectorizer", tfv_train, tfv_test),
)
model_runners = (
    lambda train, test: call_knn(train, y_train, test, y_test, 65),
    lambda train, test: call_MNB(train, y_train, test, y_test),
    lambda train, test: call_LogReg(train, y_train, test, y_test, 1.2),
    lambda train, test: call_svc(train, y_train, test, y_test),
)
for run_model in model_runners:
    for feature_label, train_matrix, test_matrix in feature_sets:
        print(feature_label)
        run_model(train_matrix, test_matrix)



-----------------------------------
No Pre-Processor
-----------------------------------
CountVectorize
KNN
F1-Score  0.30328767554809904
Accuracy= 0.29
-------------------------
TfidfVectorizer
KNN
F1-Score  0.5676678406013104
Accuracy= 0.572
-------------------------
CountVectorize
MultinomialNB
F1-Score  0.6599496903398773
Accuracy= 0.6626666666666666
-------------------------
TfidfVectorizer
MultinomialNB
F1-Score  0.666572123578661
Accuracy= 0.67
-------------------------
CountVectorize
Logistic Regressions
F1-Score  0.7100654263550047
Accuracy= 0.708
-------------------------
TfidfVectorizer
Logistic Regressions
F1-Score  0.7031238730862606
Accuracy= 0.7026666666666667
-------------------------
CountVectorize
SVC
F1-Score  0.37690366163504785
Accuracy= 0.3433333333333333
-------------------------
TfidfVectorizer
SVC
F1-Score  0.5141050382040246
Accuracy= 0.45466666666666666
-------------------------


In [24]:
# Same experiment as the baseline, but both vectorizers use doc_text_preprocessor.
cnt_v = CountVectorizer(preprocessor=doc_text_preprocessor)
cv_train = cnt_v.fit_transform(X_train)
cv_test = cnt_v.transform(X_test)

tf_v = TfidfVectorizer(preprocessor=doc_text_preprocessor)
tfv_train = tf_v.fit_transform(X_train)
tfv_test = tf_v.transform(X_test)

banner = 35 * "-"
print(banner)
print("Custom Pre-Processor")
print(banner)

# Each model is evaluated on the count features first, then on the tf-idf ones.
feature_sets = (
    ("CountVectorize", cv_train, cv_test),
    ("TfidfVectorizer", tfv_train, tfv_test),
)
model_runners = (
    lambda train, test: call_knn(train, y_train, test, y_test, 65),
    lambda train, test: call_MNB(train, y_train, test, y_test),
    lambda train, test: call_LogReg(train, y_train, test, y_test, 1.2),
    lambda train, test: call_svc(train, y_train, test, y_test),
)
for run_model in model_runners:
    for feature_label, train_matrix, test_matrix in feature_sets:
        print(feature_label)
        run_model(train_matrix, test_matrix)


-----------------------------------
Custom Pre-Processor
-----------------------------------
CountVectorize
KNN
F1-Score  0.3592217975044399
Accuracy= 0.35
-------------------------
TfidfVectorizer
KNN
F1-Score  0.3958725297675388
Accuracy= 0.39266666666666666
-------------------------
CountVectorize
MultinomialNB
F1-Score  0.6539624023303388
Accuracy= 0.6573333333333333
-------------------------
TfidfVectorizer
MultinomialNB
F1-Score  0.6637724965436927
Accuracy= 0.6653333333333333
-------------------------
CountVectorize
Logistic Regressions
F1-Score  0.7096158186306646
Accuracy= 0.708
-------------------------
TfidfVectorizer
Logistic Regressions
F1-Score  0.6907137641606641
Accuracy= 0.6926666666666667
-------------------------
CountVectorize
SVC
F1-Score  0.370392744287821
Accuracy= 0.35733333333333334
-------------------------
TfidfVectorizer
SVC
F1-Score  0.5492990071603449
Accuracy= 0.526
-------------------------


In [None]:
# Variant: 1- to 5-grams with English stop words removed, no custom pre-processor.
ngram_options = dict(ngram_range=(1, 5), stop_words='english')

cnt_v = CountVectorizer(**ngram_options)
cv_train = cnt_v.fit_transform(X_train)
cv_test = cnt_v.transform(X_test)

tf_v = TfidfVectorizer(**ngram_options)
tfv_train = tf_v.fit_transform(X_train)
tfv_test = tf_v.transform(X_test)

banner = 35 * "-"
print(banner)
print("No Pre-Processor - ngram(1,5) & stopwords ")
print(banner)

# Each model is evaluated on the count features first, then on the tf-idf ones.
feature_sets = (
    ("CountVectorize", cv_train, cv_test),
    ("TfidfVectorizer", tfv_train, tfv_test),
)
model_runners = (
    lambda train, test: call_knn(train, y_train, test, y_test, 65),
    lambda train, test: call_MNB(train, y_train, test, y_test),
    lambda train, test: call_LogReg(train, y_train, test, y_test, 1.2),
    lambda train, test: call_svc(train, y_train, test, y_test),
)
for run_model in model_runners:
    for feature_label, train_matrix, test_matrix in feature_sets:
        print(feature_label)
        run_model(train_matrix, test_matrix)



-----------------------------------
No Pre-Processor - ngram(1,5) & stopwords 
-----------------------------------
CountVectorize
KNN
F1-Score  0.022553681501049926
Accuracy= 0.08666666666666667
-------------------------
TfidfVectorizer
KNN
F1-Score  0.4562024949278127
Accuracy= 0.44
-------------------------
CountVectorize
MultinomialNB
F1-Score  0.5783587116000043
Accuracy= 0.5906666666666667
-------------------------
TfidfVectorizer
MultinomialNB
F1-Score  0.5756158664733498
Accuracy= 0.5906666666666667
-------------------------
CountVectorize
Logistic Regressions
F1-Score  0.4633515344265212
Accuracy= 0.424
-------------------------
TfidfVectorizer
Logistic Regressions
F1-Score  0.5509769462388052
Accuracy= 0.5626666666666666
-------------------------
CountVectorize


In [25]:
# Split raw_df into one DataFrame per doc_type for EDA.

def _rows_of_type(doc_type):
    """Return the rows of raw_df whose doc_type equals ``doc_type``."""
    return raw_df[raw_df['doc_type'] == doc_type]


Resume_df = _rows_of_type('Resume')
Email_df = _rows_of_type('E-mail')
Letter_df = _rows_of_type('Letter')
Functional_specification_df = _rows_of_type('Functional specification')
Form_df = _rows_of_type('Form')
Invoice_df = _rows_of_type('Invoice')
News_Article_df = _rows_of_type('News Article')
Memo_df = _rows_of_type('Memo')
scientific_publication_df = _rows_of_type('scientific publication')
scientific_report_df = _rows_of_type('scientific report')
Questionnaire_df = _rows_of_type('Questionnaire')
Budget_df = _rows_of_type('Budget')
Presentation_df = _rows_of_type('Presentation')
Handwritten_Note_df = _rows_of_type('Handwritten Note')
Advertisement_df = _rows_of_type('Advertisement')
File_Folder_df = _rows_of_type('File Folder')
SOW_df = _rows_of_type('SOW')
Taxes_df = _rows_of_type('Taxes')
Human_Resources_df = _rows_of_type('Human Resources')
Annual_Report_df = _rows_of_type('Annual Report')
Financial_Statements_df = _rows_of_type('Financial Statements')
Agreements_df = _rows_of_type('Agreements')
News_release_df = _rows_of_type('News release')
Product_Documentation_df = _rows_of_type('Product Documentation')
