## Step6: Training Model based on training and validation set.

Random Forest Classifier generally does not require CV so combine train validation set.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.externals import joblib
import datetime as dt
import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
#load data. no need for CV so just training data.
x_trainval = pd.read_csv('x_trainval.csv') # Training and validation dataset.
y_trainval = pd.read_csv('y_trainval.csv')
print(x_trainval.head(3))
print(y_trainval.head(3))
print(x_trainval.shape)
print(y_trainval.shape)

        Cnid                         Description
0  CN3339767  development data management system
1  CN3419927                    software upgrade
2  CN3378926        specialist technical advisor
        Cnid  yesIT
0  CN3339767      1
1  CN3419927      1
2  CN3378926      1
(89120, 2)
(89120, 2)


In [3]:
# then we turn cnid column into index so that x and y would match. Maybe not necessary?
x_trainval.index = x_trainval['Cnid']
x_trainval.drop(['Cnid'],axis=1,inplace=True)
y_trainval.index = y_trainval['Cnid']
y_trainval.drop(['Cnid'],axis=1,inplace=True)

In [4]:
print(x_trainval.shape)
print(y_trainval.shape)

(89120, 1)
(89120, 1)


In [5]:
y_trainval.index

Index(['CN3339767', 'CN3419927', 'CN3378926', 'CN3432035', 'CN3295947',
       'CN3396801', 'CN2979762', 'CN3421256', 'CN3330935', 'CN3318967-A1',
       ...
       'CN3457863', 'CN3385378', 'CN3428351', 'CN3445544', 'CN3028072',
       'CN3320700', 'CN3440890', 'CN3369479-A1', 'CN3212192', 'CN3304727'],
      dtype='object', name='Cnid', length=89120)

In [6]:
x_trainval.Description.fillna(value="NoDesc",inplace=True)

In [7]:
tfidf_vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,2))
tfidf = tfidf_vectorizer.fit_transform(x_trainval["Description"])

In [8]:
print(type(tfidf))
print(tfidf)
print(tfidf_vectorizer.get_feature_names()[:99])

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 20362)	0.28173423084
  (0, 18063)	0.290783226673
  (0, 42647)	0.237070766278
  (0, 74235)	0.249500151534
  (0, 20434)	0.599921351426
  (0, 18215)	0.466917255185
  (0, 42914)	0.373765586073
  (1, 68901)	0.353904836993
  (1, 79186)	0.546761286164
  (1, 69166)	0.758817146818
['aa', 'aa accessibility', 'aa compliant', 'aa installation', 'aa redesigned', 'aa rp', 'aa size', 'aa spare', 'aaa', 'aaa industry', 'aab', 'aab pull', 'aabd', 'aacap', 'aacap camp', 'aacqa', 'aacqa victoria', 'aad', 'aad aerial', 'aad ageing', 'aad kingston', 'aad maximo', 'aad record', 'aad share', 'aaf', 'aaf av', 'aaf publication', 'aaf registration', 'aafc', 'aafc sxpl', 'aafcans', 'aage', 'aage conference', 'aagis', 'aagis grain', 'aahl', 'aahl victoria', 'aahu', 'aahu grant', 'aahu text', 'aams', 'aams lawyer', 'aams probity', 'aams security', 'aapa', 'aapa certificate', 'aapg', 'aapg seg', 'aapt', 'aarnet', 'aarnet internet', 'aarnet wireless', 'aaron', 'aaron davis

In [9]:
# optiplex 8 core processor
clf = RandomForestClassifier(n_estimators=800,criterion='gini',n_jobs =-1,random_state =1)

start_time = time.time()
print("training...."); print("start time:" + str(dt.datetime.now().time()))

clf.fit(tfidf,y_trainval)
print("Training completed."); print("end time:" + str(dt.datetime.now().time()))
print("Time Spent."); print(time.time() - start_time)

training....
start time:11:17:27.908918


  import sys


Training completed.
end time:11:52:19.132521
Time Spent.
2091.2236034870148


In [10]:
# save the model to pickle
joblib.dump(clf,'RandomForestv2.pkl')

['RandomForestv2.pkl']

## Figuring out the discrepencies in the existing training set.

In [18]:
# measure new dataset. 
y_train_fit = clf.predict(tfidf) # the transformed matrix to fit.

In [19]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [20]:
# fit data set and get accuracy, confusion matrix.
# firstly need to tfidf transform test data set into sparse matrix.
len(y_train_fit)

89120

In [21]:
print(classification_report(y_train_fit,y_trainval))

             precision    recall  f1-score   support

          0       0.97      0.96      0.97     45858
          1       0.96      0.97      0.96     43262

avg / total       0.97      0.97      0.97     89120



## Transforming df_test to compare results.

    need to define a funtion to do the steps of stemming and stopword removal and reconstructing as a input.

Since df_test is not lemmatised, we need to do Step2-4 in CH2.

# #not finished

In [11]:
# the Target DataFrame need to have "Description" column. And 'CNID' column.
def Cleanstep234(df):
    # replace 'IT' with 'infotech'
    IT_regex = r'(?<=(\b))IT(?=(\b))'
    df.Description = df.Description.str.replace(IT_regex, 'infotech',case=True)
    df.index = df.Cnid
    df.drop(['Cnid'],axis =1,inplace=True)
    ###########################################################################################