In [218]:
%matplotlib inline
import pandas as pd
from pandas import DataFrame
import functools
import glob, os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib

In [179]:
SIData_dir = '/Users/thuong/Documents/SI_Data'
df_arr = []
for f in glob.glob(os.path.join(SIData_dir, '*.csv')):
    df_arr.append(pd.read_csv(f, encoding='latin1'))
df = functools.reduce(lambda left, right: pd.merge(left, right, on=['BKG_NO']), df_arr)

In [180]:
df.replace({'#': ' '}, regex=True, inplace=True)
df.replace({'\$': ' '}, regex=True, inplace=True)
df.head(5)

Unnamed: 0,BKG_NO,ALSO_NOTIFY,CONSIGNEE,POR,POL,POD,DEL,VESSEL,MARK,DESCRIPTION,NOTIFY,SHIPPER
0,AAR402333700,,TO ORDER,AARHUS PORT,ROTTERDAM,BEIJIAO CHINA,BEIJIAO CHINA,HAMMERSMITH BRIDGE 0029E,,FF WHITE FISH MEAL DANISH WHITE FISHMEAL (F...,FUZHOU PIN GUAN TRADE CO. LTD. 1720RM 17/F. ...,FF SKAGEN A/S HAVNEVAGTVEJ 5 9990 SKAGEN DE...
1,AAR402376700,,BIOSEVE 5 AVENUE LA PRINCESSE LALLA MERYEM ...,FREDERICIA,HAMBURG,CASABLANCA,CASABLANCA,HANJIN EUROPE 0013E,,1195 CLL FISH FEED SHIPPED ON BOARD HANJIN ...,SAME AS CONSIGNEE,ALLER AQUA A/S ALLERVEJ 130 DK-6070 CHRISTIA...
2,AAR402394700,,TO ORDER,FREDERICIA,HAMBURG,PORT KLANG,PORT KLANG,CSCL GLOBE 0002E,,"DANISH FISHMEAL ""999 PRIME QUALITY"" FREIGHT...",T.B.M. SDN.BHD. 10H-1 JALAN TUN ABDUL RAZAK ...,NORSILDMEL AS KJERREIDVIKEN 16 5141 FYLLINGS...
3,AAR402402400,* PHONE: +862258783500 FAX: +862258783555,DSV AIR & SEA CO. LTD SUITE 1402 NINGTAI PL...,FREDERICIA,HAMBURG,XINGANG,XINGANG,COSCO PORTUGAL 0003E,7566612656.0,WINDMILL PARTS,ALL INVOICE COPIES DSV AIR & SEA CO. LTD SU...,DSV AIR & SEA A/S I101 NOKIAVEJ 30 8700 HORS...
4,AAR402395100,** TEL.: (886) 2 2507 7071 EXT. 223,TO ORDER,AARHUS,HAMBURG,KAOHSIUNG,KAOHSIUNG,HUMEN BRIDGE 0047E,,810 BAGS DANISH FISHMEAL 999-LT SHIPPED ON...,CHAROEN POKPHAND ENTERPRISE (TAIWAN) CO. LTD....,NORSILDMEL AS KJERREIDVIKEN 16 5141 FYLLINGS...


In [181]:
from sklearn.feature_extraction import text
stop_words = set(text.ENGLISH_STOP_WORDS)

In [182]:
def text_cleaning(text):
    words = []
    words.extend(w for w in str(text).split() 
                    if w.isalpha() and len(w) != 1 and w.lower() != 'nan' 
                                    and w.lower() not in stop_words)
    return ' '.join(words)

In [230]:
df_data = DataFrame(columns=['Value', 'Class'])
class_names = []
for col in df:
    if col != 'BKG_NO':
        df[col] = df[col].apply(text_cleaning)
        tmp_df = DataFrame(columns=['Value', 'Class'])
        tmp_df['Value'] = df[col]
        tmp_df['Class'] = col
        df_data = df_data.append(tmp_df)
        class_names.append(col)
    else:
        df[col] = df[col]
df_data = df_data[df_data['Value'] != ""]
df_data = df_data.reset_index(drop=True)

Unnamed: 0,Value,Class
0,EXT,ALSO_NOTIFY
1,SELANGOR MALAYSIA,ALSO_NOTIFY
2,FAX,ALSO_NOTIFY
3,FAX,ALSO_NOTIFY
4,TEL FAX,ALSO_NOTIFY
5,COM,ALSO_NOTIFY
6,NOWACO APS PRINSENSGADE BOX AALBORG DENMARK,ALSO_NOTIFY
7,NOWACO SHIPPING APS PRINSENSGADE AALBORG DENMARK,ALSO_NOTIFY
8,PHONE,ALSO_NOTIFY
9,SELANGOR MALAYSIA,ALSO_NOTIFY


In [231]:
df_data = df_data.reindex(np.random.permutation(df_data.index))

In [232]:
TRAIN_SIZE = 0.8
size = int(len(df_data) * TRAIN_SIZE)
df_train, df_test = df_data.iloc[:size], df_data.iloc[size:]

In [238]:
total_amount = [len(df_data[df_data['Class'] == c]) for c in class_names]
train_amount = [len(df_train[df_train['Class'] == c]) for c in class_names]
test_amount = [len(df_test[df_test['Class'] == c]) for c in class_names]
tmp_arr = np.array([total_amount, train_amount, test_amount])
print(DataFrame(tmp_arr, ['Total', 'Train', 'Test'], class_names))

       ALSO_NOTIFY  CONSIGNEE     POR     POL     POD     DEL  VESSEL   MARK  \
Total        25312     102672  102232  102388  100899  100900  102737  44181   
Train        20211      82173   81769   81824   80724   80626   82266  35346   
Test          5101      20499   20463   20564   20175   20274   20471   8835   

       DESCRIPTION  NOTIFY  SHIPPER  
Total       102645  101428   102731  
Train        82092   81249    82220  
Test         20553   20179    20511  


In [219]:
pipeline = Pipeline([
                    ('vectorizer', CountVectorizer()),
                    ('tfidf_transformer',  TfidfTransformer()),
                    ('classifier', LinearSVC())])

train_data, test_data = df_train['Value'].values, df_test['Value'].values
train_target, test_target = df_train['Class'].values, df_test['Class'].values

pipeline.fit(train_data, train_target)
predictions = pipeline.predict(test_data)

cnf_matrix = confusion_matrix(test_target, predictions)
print('Confusion matrix with one-fold: ')
print(cnf_matrix)
print("Score with one-fold: %s" % precision_score(test_target, predictions, average = 'weighted'))
print("Score with one-fold: %s" % precision_score(test_target, predictions, average = None))

Confusion matrix with one-fold: 
[[ 4004   283    50    32    63   269    98    32    18   148     8]
 [  106 14381    12    21    68  4828    13     2     6   962     2]
 [   35     5  4621     1    10     5 11856  1982  1703    12     1]
 [   29    13     3 20268   140    12     2     0     0    12     0]
 [  224   199    37   271  7655   113   110    27    27   107     6]
 [  182  6601     9    15    77 12821    11     1     4   690     4]
 [   34     2  1820     0    11     0 14375  3741   230     1     2]
 [    4     2    44     0     5     1  1358 17164  1812     0     0]
 [   41     1   286     1     5     3  1031  9875  9220    13     1]
 [   57   478    24    14    46   281     2     2    10 19732     1]
 [   28     0     3     0     3     2    15     2     2     2 20531]]
Score with one-fold: 0.750892781278
Score with one-fold: [ 0.84401349  0.65472342  0.66883775  0.98278621  0.94704936  0.6992637
  0.49790447  0.52284635  0.70748926  0.91018958  0.99878381]
