In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy as sp
from nltk.corpus import stopwords
import re
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from textblob import TextBlob

In [3]:
data_path = './data'

In [4]:
raw_data_test = pd.read_excel(data_path + '/Data_Test.xlsx')
raw_data_train = pd.read_excel(data_path + '/Data_Train.xlsx')

In [7]:
raw_data_test.shape

(2748, 1)

In [6]:
raw_data_train.shape

(7628, 2)

In [8]:
raw_data_train['count'] = raw_data_train.groupby('STORY')['STORY'].transform(np.size)

In [11]:
raw_data_train = raw_data_train.loc[raw_data_train['count'] == 1]

In [12]:
split = raw_data_train.shape[0]

In [13]:
split

7499

In [15]:
data_under_processing = pd.concat([raw_data_train,raw_data_test],sort=False)

In [16]:
processed_data['STORY'] = data_under_processing['STORY']

In [17]:
processed_data['word_count'] = data_under_processing['STORY'].apply(lambda x: len(x.split(' ')))

In [20]:
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))
processed_data['avg_word_length'] = processed_data['STORY'].apply(lambda x: avg_word(x))

In [21]:
processed_data['char_count'] = processed_data['STORY'].apply(len)
processed_data['word_density'] = processed_data['char_count'] / (processed_data['word_count']+1)

In [22]:
processed_data['STORY'] = processed_data['STORY'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [23]:
processed_data['STORY'] = processed_data['STORY'].replace('[^\w\s]','')

In [24]:
stop = stopwords.words('english')

In [25]:
processed_data['STORY'] = processed_data['STORY'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [26]:
processed_data['STORY'] = processed_data['STORY'].apply(lambda x: re.sub('[^a-zA-Z \n\.]', ' ', x))
processed_data['STORY'] = processed_data['STORY'].apply(lambda x: re.sub('[.]', ' ', x))

In [27]:
processed_data['STORY_LEM'] = processed_data['STORY'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [29]:
processed_data['STORY_LEM2'] = processed_data['STORY_LEM'].apply(lambda x: " ".join(x for x in x.split() if len(x) > 2))

In [79]:
rare_freq = pd.Series(' '.join(processed_data['STORY_LEM2']).split()).value_counts()[-10000:]
rare_freq = list(rare_freq.index)
processed_data['STORY_LEM2'] = processed_data['STORY_LEM2'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_freq))

In [110]:
def specific_pos_tag(x, flag):
    string = ''
    try:
        wiki = TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                string = string + ' ' + list(tup)[0] 
    except:
        pass
    return string

In [109]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

In [111]:
processed_data['noun_String'] = processed_data['STORY_LEM2'].apply(lambda x: specific_pos_tag(x, 'noun'))

In [138]:
processed_data['adj_String'] = processed_data['STORY_LEM2'].apply(lambda x: specific_pos_tag(x, 'adj'))
processed_data['adv_String'] = processed_data['STORY_LEM2'].apply(lambda x: specific_pos_tag(x, 'adv'))

In [220]:
rare_freq = pd.Series(' '.join(processed_data['adj_String']).split()).value_counts()[-3400:]
rare_freq = list(rare_freq.index)
processed_data['adj_String'] = processed_data['adj_String'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_freq))
rare_freq = pd.Series(' '.join(processed_data['adv_String']).split()).value_counts()[-1000:]
rare_freq = list(rare_freq.index)
processed_data['adv_String'] = processed_data['adv_String'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_freq))

In [226]:
rare_freq = pd.Series(' '.join(processed_data['noun_String']).split()).value_counts()[-2800:]
rare_freq = list(rare_freq.index)
processed_data['noun_String'] = processed_data['noun_String'].apply(lambda x: " ".join(x for x in x.split() if x not in rare_freq))

In [227]:
processed_data['noun_adj_adv_string'] = processed_data['noun_String'] + ' ' + processed_data['adj_String'] + ' ' + processed_data['adv_String'] 

In [228]:
processed_data['noun_adj_string'] = processed_data['noun_String'] + ' ' + processed_data['adj_String'] 

In [80]:
processed_data.to_csv(data_path + '/process_data3.csv')

In [140]:
ready_data = processed_data
# ready_data['SECTION'] = processed_data['SECTION']

In [113]:
# ready_data = ready_data.drop(columns=['STORY','count','STORY_LEM'])
ready_data.head()

Unnamed: 0,SECTION,word_count,avg_word_length,char_count,word_density,STORY_LEM2,noun_String
0,3.0,146,4,843,5.734694,painful huge reversal fee income among private...,reversal income sector bank fee loan deal boo...
1,0.0,17,6,129,7.166667,formidable opposition alliance among congress ...,opposition alliance jharkhand mukti jharkhand...
2,3.0,57,5,386,6.655172,asian currency trading lower today south korea...,currency today china china ringgit rupiah dol...
3,1.0,101,4,587,5.754902,want answer question click answer clicking ans...,question click answer answer proceed answer v...
4,3.0,46,5,299,6.361702,global market gold price edged today disappoin...,market gold price today factory activity data...


In [83]:
ready_data.head()

Unnamed: 0,SECTION,word_count,avg_word_length,char_count,word_density,STORY_LEM2
0,3.0,146,4,843,5.734694,painful huge reversal fee income among private...
1,0.0,17,6,129,7.166667,formidable opposition alliance among congress ...
2,3.0,57,5,386,6.655172,asian currency trading lower today south korea...
3,1.0,101,4,587,5.754902,want answer question click answer clicking ans...
4,3.0,46,5,299,6.361702,global market gold price edged today disappoin...


In [114]:
labels_train = ready_data[:split]['SECTION']

In [256]:
mapper = DataFrameMapper([
#     (list(ready_data.columns)[1:4],None),
    ('noun_String',CountVectorizer(binary=True, ngram_range=(0,1))),
#     ('noun_String',CountVectorizer(binary=True, ngram_range=(0,1))),
    ('noun_String',TfidfVectorizer(smooth_idf=True)),
    ('adj_String',CountVectorizer(binary=True, ngram_range=(0,1))),
#     ('noun_String',CountVectorizer(binary=True, ngram_range=(0,1))),
    ('adj_String',TfidfVectorizer(smooth_idf=True))
])

In [257]:
X_mapped = mapper.fit_transform(ready_data)

In [258]:
train_mapped,test_mapped = X_mapped[:split,:],X_mapped[split:,:]

In [259]:
X_train,X_test,Y_train,Y_test=train_test_split(train_mapped,labels_train,test_size=0.1,random_state=42)

In [62]:
from sklearn.neural_network import MLPClassifier

In [260]:
clf_neural = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200,100), random_state=1)

In [261]:
clf_neural.fit(X_train,Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [241]:
Y_pred_neural_train = clf_neural.predict(X_train)
print confusion_matrix(Y_pred_neural_train,Y_train)
print accuracy_score(Y_pred_neural_train,Y_train)

[[1499    0    0    0]
 [   0 2447    0    0]
 [   0    0 1714    0]
 [   0    0    0 1089]]
1.0


In [249]:
Y_pred_neural_test = clf_neural.predict(X_test)
print confusion_matrix(Y_pred_neural_test,Y_test)
print accuracy_score(Y_pred_neural_test,Y_test)

[[154   5   4   1]
 [  2 249   3   7]
 [  2   2 185   1]
 [  4   1   0 130]]
0.9573333333333334


In [129]:
pred_test_neural = clf_neural.predict(test_mapped)
pred_df = pd.DataFrame(pred_test_neural,columns=['SECTION'])
pred_df.to_csv(data_path+'/submission_nl12.csv',index=False)

In [198]:
from sklearn.ensemble import RandomForestClassifier

In [212]:
model_rf = RandomForestClassifier(n_estimators=100)

In [233]:
model_rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [234]:
Y_pred_rf_test = model_rf.predict(X_test) #0.956
print confusion_matrix(Y_test,Y_pred_rf_test)
print accuracy_score(Y_test,Y_pred_rf_test)

[[154   2   4   2]
 [  2 250   5   0]
 [  4   1 186   1]
 [  2   9   3 125]]
0.9533333333333334


In [215]:
Y_pred_rf_train = model_rf.predict(X_train)
print confusion_matrix(Y_train,Y_pred_rf_train)
print accuracy_score(Y_train,Y_pred_rf_train)

[[1499    0    0    0]
 [   0 2447    0    0]
 [   0    0 1714    0]
 [   0    0    0 1089]]
1.0


In [268]:
processed_data_new = pd.read_csv('D:/Hands on ML/MachineHack News Category/art_clss_backup_art_clss_processed_data5.csv')

In [271]:
processed_data_new['STORY_LEM2_noverbpron'] = processed_data_new['STORY_LEM2_noverbpron'].apply(lambda x: " ".join(x for x in x.split() if len(x) > 2))

In [401]:
global noun_corpus_dict
noun_corpus_dict = {
    'noun_corpus_0':[],
    'noun_corpus_1':[],
    'noun_corpus_2':[],
    'noun_corpus_3':[]        
}
def specific_pos_corpus(x, flag,sec):
#     try:
    global noun_corpus_dict
    corpus = []
    current = noun_corpus_dict['noun_corpus_'+sec]
    wiki = TextBlob(x)
    for tup in wiki.tags:
        ppo = list(tup)[1]
        if ppo in pos_family[flag]:
            corpus = np.append(corpus,list(tup)[0])
    current.extend(corpus)
    noun_corpus_dict['noun_corpus_'+sec] = list(current)
#     except:
#         pass

In [404]:
processed_data_new.loc[processed_data_new['SECTION']==0]['STORY_LEM2_noverbpron'].apply(lambda x: specific_pos_corpus(x, 'noun','0'))
processed_data_new.loc[processed_data_new['SECTION']==1]['STORY_LEM2_noverbpron'].apply(lambda x: specific_pos_corpus(x, 'noun','1'))
processed_data_new.loc[processed_data_new['SECTION']==2]['STORY_LEM2_noverbpron'].apply(lambda x: specific_pos_corpus(x, 'noun','2'))
processed_data_new.loc[processed_data_new['SECTION']==3]['STORY_LEM2_noverbpron'].apply(lambda x: specific_pos_corpus(x, 'noun','3'))

0       None
2       None
4       None
6       None
7       None
11      None
14      None
17      None
22      None
35      None
36      None
39      None
42      None
48      None
50      None
53      None
54      None
62      None
64      None
65      None
78      None
106     None
109     None
118     None
122     None
123     None
131     None
137     None
142     None
144     None
        ... 
7331    None
7332    None
7336    None
7339    None
7342    None
7346    None
7348    None
7350    None
7356    None
7357    None
7358    None
7364    None
7375    None
7381    None
7396    None
7398    None
7402    None
7405    None
7412    None
7437    None
7440    None
7442    None
7448    None
7452    None
7464    None
7473    None
7476    None
7480    None
7487    None
7489    None
Name: STORY_LEM2_noverbpron, Length: 1229, dtype: object

In [405]:
noun_corpus_dict

{'noun_corpus_0': [u'opposition',
  u'alliance',
  u'jharkhand',
  u'mukti',
  u'jmm',
  u'jharkhand',
  u'vikas',
  u'morcha',
  u'prajatantrik',
  u'story',
  u'wire',
  u'agency',
  u'modification',
  u'text',
  u'headline',
  u'statement',
  u'air',
  u'strike',
  u'janata',
  u'party',
  u'bjp',
  u'bag',
  u'seat',
  u'attack',
  u'twist',
  u'intent',
  u'india',
  u'air',
  u'strike',
  u'terrorist',
  u'training',
  u'camp',
  u'jem',
  u'statement',
  u'context',
  u'situation',
  u'bjp',
  u'month',
  u'time',
  u'bjp',
  u'seat',
  u'leadership',
  u'modi',
  u'yeddyurappa',
  u'twitter',
  u'control',
  u'damage',
  u'statement',
  u'day',
  u'game',
  u'nation',
  u'tension',
  u'situation',
  u'day',
  u'age',
  u'note',
  u'india',
  u'pakistan',
  u'pti',
  u'party',
  u'country',
  u'twitter',
  u'hashtag',
  u'election',
  u'agriculture',
  u'form',
  u'part',
  u'poll',
  u'bjp',
  u'banking',
  u'scheme',
  u'msp',
  u'promise',
  u'india',
  u'vote',
  u'party',
 

In [421]:
def count_pos_sec(x,flag,sec):
    cnt = 0
    for word in x.split(' '):
        if word in noun_corpus_dict['noun_corpus_'+sec]:
            cnt = cnt + 1
    return cnt

In [424]:
processed_data_new['noun_cnt_0'] = processed_data_new['STORY_LEM2_noverbpron'].apply(lambda x: count_pos_sec(x,'noun','0'))
# processed_data_new['STORY_LEM2_noverbpron'].apply(lambda x: count_pos_sec(x,'noun','0'))

In [427]:
processed_data_new.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,STORY,SECTION,STORY_LEM,STORY_LEM2,STORY_LEM2_noverbpron,noun_String,adj_String,adv_String,noun_cnt_0,noun_cnt_1,noun_cnt_2,noun_cnt_3
0,0,0,painful huge reversal fee income unheard amon...,3.0,painful huge reversal fee income unheard among...,painful huge reversal fee income unheard among...,painful huge reversal fee income among private...,reversal income sector bank fee loan deal boo...,painful huge fee unheard private structured u...,lender essentially rather well,45,45,38,48
1,1,1,formidable opposition alliance among congress ...,0.0,formidable opposition alliance among congress ...,formidable opposition alliance among congress ...,formidable opposition alliance among congress ...,opposition alliance jharkhand mukti jharkhand...,formidable congress jmm,,10,6,3,3
2,2,2,asian currencies trading lower today south ko...,3.0,asian currency trading lower today south korea...,asian currency trading lower today south korea...,asian currency lower today south korean china ...,currency today china china ringgit rupiah dol...,asian lower south korean renminbi malaysian i...,offshore however close,12,16,11,17
3,3,3,want answer question click answer clicking...,1.0,want answer question click answer clicking ans...,want answer question click answer clicking ans...,answer question click answer answer user answe...,question click answer answer proceed answer v...,answer user ask neighbour many icon follow an...,also reply already,28,31,31,20
4,4,4,global markets gold prices edged today disapp...,3.0,global market gold price edged today disappoin...,global market gold price edged today disappoin...,global market gold price today chinese factory...,market gold price today factory activity data...,global chinese global european weaker asian,,19,21,19,22


In [426]:
processed_data_new['noun_cnt_1'] = processed_data_new['STORY_LEM2_noverbpron'].apply(lambda x: count_pos_sec(x,'noun','1'))
processed_data_new['noun_cnt_2'] = processed_data_new['STORY_LEM2_noverbpron'].apply(lambda x: count_pos_sec(x,'noun','2'))
processed_data_new['noun_cnt_3'] = processed_data_new['STORY_LEM2_noverbpron'].apply(lambda x: count_pos_sec(x,'noun','3'))

In [428]:
ready_data = processed_data_new

In [435]:
processed_data_new.to_csv(data_path + '/process_data4.csv')

In [284]:
# processed_data_new[split+2:]['SECTION'].unique

In [429]:
labels_train = ready_data[:split+2]['SECTION']

In [459]:
mapper = DataFrameMapper([
    (['noun_cnt_0','noun_cnt_1','noun_cnt_2','noun_cnt_3'],None),
    ('STORY_LEM2',CountVectorizer(binary=True, ngram_range=(0,1))),
#     ('noun_String',CountVectorizer(binary=True, ngram_range=(0,1))),
    ('STORY_LEM2',TfidfVectorizer(smooth_idf=True)),
#     ('adj_String',CountVectorizer(binary=True, ngram_range=(0,1))),
     ('noun_String',TfidfVectorizer(smooth_idf=True)),
#     ('adj_String',TfidfVectorizer(smooth_idf=True))
])

In [None]:
X_mapped = mapper.fit_transform(ready_data)
train_mapped,test_mapped = X_mapped[:split+2,:],X_mapped[split+2:,:]
X_train,X_test,Y_train,Y_test=train_test_split(train_mapped,labels_train,test_size=0.1,random_state=42)

In [294]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB

In [451]:
model_naive_bayes = MultinomialNB()
model_naive_bayes.fit(X_train,Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [452]:
Y_pred_nb_test = model_naive_bayes.predict(X_test)
print confusion_matrix(Y_pred_nb_test,Y_test)
print accuracy_score(Y_pred_nb_test,Y_test)

[[153   1  10   1]
 [  4 276   9   4]
 [  0   0 175   0]
 [  0   0   0 118]]
0.9613848202396804


In [None]:
clf_neural = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(300,200), random_state=1)
clf_neural.fit(X_train,Y_train)

In [None]:
Y_pred_neural_test = clf_neural.predict(X_test)
print confusion_matrix(Y_pred_neural_test,Y_test)
print accuracy_score(Y_pred_neural_test,Y_test)

In [437]:
pred_test_neural = clf_neural.predict(test_mapped)
pred_df = pd.DataFrame(pred_test_neural,columns=['SECTION'])
pred_df.to_csv(data_path+'/submission_nl17.csv',index=False)

In [303]:
model_naive_bayes.predict_proba(X_test)[0]

array([7.52709155e-30, 1.00000000e+00, 5.10148338e-29, 3.04408206e-30])

In [440]:
pred_prob = clf_neural.predict_proba(X_test)

In [441]:
pred_prob1 = clf_neural_lbfgs.predict_proba(X_test)

In [442]:
# max(pred_prob1[10])
pred_prob.shape

(751L, 4L)

In [323]:
print int(np.where(pred_prob1[10] == np.amax(pred_prob1[10]))[0])

2


In [443]:
final_pred = np.empty([0,751],int)

In [444]:
for (i,j) in zip(pred_prob,pred_prob1):
    if np.max(i) > np.max(j):
        final_pred = np.append(final_pred,int(np.where(i == np.amax(i))[0]))
    else:
        final_pred = np.append(final_pred,int(np.where(j == np.amax(j))[0]))        

In [445]:
final_pred.shape

(751L,)

In [446]:
print confusion_matrix(final_pred,Y_test)
print accuracy_score(final_pred,Y_test)

[[153   0   1   1]
 [  2 275   0   1]
 [  2   1 193   0]
 [  0   1   0 121]]
0.9880159786950732


In [455]:
clf_neural_lbfgs = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(200,100), random_state=1) #200100 0.9853528628495339
clf_neural_lbfgs.fit(X_train,Y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [456]:
Y_pred_neural_lbfgs_test = clf_neural_lbfgs.predict(X_test)
print confusion_matrix(Y_pred_neural_lbfgs_test,Y_test)
print accuracy_score(Y_pred_neural_lbfgs_test,Y_test)

[[154   1   2   1]
 [  1 275   0   1]
 [  2   0 192   0]
 [  0   1   0 121]]
0.9880159786950732


In [340]:
from xgboost import XGBClassifier

In [343]:
xgb = XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=6,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='multi:softprob', nthread=4, scale_pos_weight=1, seed=27)
xgb.fit(X_train,Y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=3, missing=None,
       n_estimators=300, n_jobs=1, nthread=4, objective='multi:softprob',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.6)

In [344]:
Y_pred_xgb_test = xgb.predict(X_test)
print confusion_matrix(Y_pred_xgb_test,Y_test)
print accuracy_score(Y_pred_xgb_test,Y_test)

[[146   1   4   1]
 [  4 265   2   5]
 [  7   9 188   1]
 [  0   2   0 116]]
0.952063914780293


In [345]:
xgb.predict_proba(X_test)

array([[6.9680988e-05, 9.9940670e-01, 4.6338572e-04, 6.0231043e-05],
       [1.5743130e-03, 9.6106380e-01, 3.6740482e-02, 6.2134519e-04],
       [6.4385858e-05, 7.4668112e-03, 3.7819678e-05, 9.9243098e-01],
       ...,
       [1.7505150e-02, 8.5047024e-01, 1.2503280e-01, 6.9918684e-03],
       [1.8097039e-02, 6.2128139e-01, 4.7325224e-02, 3.1329635e-01],
       [9.9996662e-01, 1.0311024e-05, 2.1462902e-05, 1.5575426e-06]],
      dtype=float32)

In [369]:
del final
final = np.empty([0,751],int)

In [370]:
final.shape

(0L, 751L)

In [349]:
from scipy import stats

In [372]:
for i in range(0,len(Y_test)):
    final = np.append(final, int(stats.mode([Y_pred_nb_test[i],Y_pred_neural_test[i],Y_pred_xgb_test[i],Y_pred_neural_lbfgs_test[i]])[0]))

In [367]:
final

array([1, 1, 3, 2, 2, 1, 1, 2, 1, 1, 2, 1, 0, 0, 2, 1, 1, 2, 2, 3, 1, 1,
       1, 1, 1, 2, 1, 0, 3, 2, 3, 2, 3, 0, 1, 2, 1, 2, 2, 2, 1, 0, 2, 1,
       0, 3, 2, 2, 3, 3, 2, 0, 1, 2, 0, 1, 1, 0, 1, 3, 2, 1, 1, 1, 1, 0,
       1, 3, 2, 0, 0, 1, 0, 1, 2, 1, 1, 2, 1, 3, 1, 3, 0, 1, 1, 0, 3, 1,
       3, 3, 1, 0, 1, 1, 3, 0, 1, 1, 2, 2, 3, 2, 2, 2, 1, 0, 2, 0, 2, 1,
       0, 0, 1, 3, 3, 2, 1, 1, 2, 3, 0, 0, 0, 2, 0, 3, 1, 0, 2, 0, 3, 2,
       2, 2, 2, 1, 0, 3, 3, 1, 1, 1, 0, 0, 3, 2, 0, 1, 0, 3, 0, 1, 2, 2,
       2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 3, 0, 2, 3, 2, 1, 0, 0, 1, 2, 1, 3,
       0, 1, 1, 3, 1, 3, 1, 2, 1, 2, 1, 1, 0, 1, 2, 2, 0, 3, 0, 3, 1, 0,
       1, 1, 2, 2, 2, 2, 0, 1, 1, 1, 3, 2, 0, 1, 1, 0, 1, 2, 1, 2, 0, 1,
       1, 1, 0, 3, 1, 3, 0, 1, 2, 1, 0, 2, 1, 3, 0, 1, 0, 0, 1, 2, 1, 3,
       0, 0, 0, 2, 1, 2, 0, 0, 2, 1, 1, 2, 1, 1, 2, 3, 0, 1, 2, 1, 1, 2,
       0, 1, 1, 0, 2, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 1, 3, 2, 0, 1,
       3, 1, 1, 1, 0, 2, 1, 3, 3, 1, 0, 1, 3, 3, 0,

In [373]:
print confusion_matrix(final,Y_test)
print accuracy_score(final,Y_test)

[[154   2   2   1]
 [  3 271   3   2]
 [  0   2 189   0]
 [  0   2   0 120]]
0.9773635153129161
