In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from sklearn import metrics
import pandas as pd
from sklearn import svm
import gensim
import gensim.parsing
import os
%matplotlib inline

In [2]:
train_df = pd.read_csv('train.tsv',delimiter='\t')

In [3]:
train_df.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [4]:
train_df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [5]:
train_df = train_df.dropna(subset=['category_name'])

In [6]:
train_df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name             0
brand_name           629225
price                     0
shipping                  0
item_description          4
dtype: int64

In [8]:
def get_first_category_name(x):
    category_name = str(x['category_name'])
    return category_name.split('/')[0]

In [9]:
# category_nameの第一カテゴリを抜き取る
#train_df['category_name']=train_df[['category_name']].fillna("Unknown")
train_df['first_category_name'] =  train_df.apply(lambda x: get_first_category_name(x), axis=1)

In [10]:
train_df['first_category_name'].value_counts()

Women                     664385
Beauty                    207828
Kids                      171689
Electronics               122690
Men                        93680
Home                       67871
Vintage & Collectibles     46530
Other                      45351
Handmade                   30842
Sports & Outdoors          25342
Name: first_category_name, dtype: int64

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

train_df['onehot_first_category_name'] = train_df[['first_category_name']].apply(le.fit_transform)

In [12]:
key = np.arange(le.classes_.shape[0])
le_dict = dict(zip(key,le.classes_))

In [13]:
le_dict

{0: 'Beauty',
 1: 'Electronics',
 2: 'Handmade',
 3: 'Home',
 4: 'Kids',
 5: 'Men',
 6: 'Other',
 7: 'Sports & Outdoors',
 8: 'Vintage & Collectibles',
 9: 'Women'}

In [15]:
train_df.head(5)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,first_category_name,onehot_first_category_name
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,5
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,1
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,9
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,3
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,Women,9


In [101]:
itemname = list(train_df['name'][:100000])

In [102]:
pre_itemname = []
# gensim.parsing.preprocess_string(文字列)は以下の関数を実行した結果を返してくれる
# strip_tags() : <a>hoge</hoge>といったhtmlのタグを消してくれる
# strip_punctuation() : ,や!といった記号を消してくれる
# strip_multiple_whitespaces() : \rや\nといった空白文字を消してくれる
# strip_numeric() : 文字列の中にある数字を消してくれる
# remove_stopwords() : am や but　といったストップワードと呼ばれる単語を削除してくれる
# strip_short() : 一定の長さを持たない単語を削除する(デフォルトでは３)
# stem_text() : ステミング(useやuseful,usingなら -> us)を行ってくれる
for i in itemname:
    pre_itemname.append(gensim.parsing.preprocess_string(i))

In [103]:
for i in range(0,5):
    print(itemname[i])
    print(pre_itemname[i],'\n')

MLB Cincinnati Reds T Shirt Size XL
['mlb', 'cincinnati', 'red', 'shirt', 'size'] 

Razer BlackWidow Chroma Keyboard
['razer', 'blackwidow', 'chroma', 'keyboard'] 

AVA-VIV Blouse
['ava', 'viv', 'blous'] 

Leather Horse Statues
['leather', 'hors', 'statu'] 

24K GOLD plated rose
['gold', 'plate', 'rose'] 



In [104]:
import collections
import re

def TF(desc,terms,norm=False):
    tf_dict = {}
    d = desc.lower()
    for term in terms:
        i = 0
        for m in re.finditer(term,d):
            i += 1
        if i > 0 and norm==False:
            tf_dict[term] = i
        elif i > 0 and norm==True:
            tf_dict[term] = i/len(set(d.split(' ')))
    return tf_dict

In [105]:
tf_dict = {}
for i in range(0,len(pre_itemname)):
    tf_dict[i] = TF(itemname[i],pre_itemname[i],norm=False)

In [106]:
for i in range(0,5):
    print(tf_dict[i])

{'mlb': 1, 'cincinnati': 1, 'red': 1, 'shirt': 1, 'size': 1}
{'razer': 1, 'blackwidow': 1, 'chroma': 1, 'keyboard': 1}
{'ava': 1, 'viv': 1, 'blous': 1}
{'leather': 1, 'hors': 1, 'statu': 1}
{'gold': 1, 'plate': 1, 'rose': 1}


In [107]:
def get_terms_dict(doc):    
    d = collections.defaultdict(int)
    for i in doc:
        # 重複はカウントしない
        terms = set(i)
        for term in terms:
            d[term] += 1
    return d

In [108]:
def IDF(terms_dict,term,doc_num):
    return (1+np.log2(doc_num/terms_dict[term]))

In [109]:
doc_num = len(pre_itemname)
terms_dict = get_terms_dict(pre_itemname)

In [110]:
terms_dict['mlb']

21

In [111]:
idf_dict = {}
for k,v in terms_dict.items():
    idf_dict[k] = IDF(terms_dict,k,doc_num)

In [112]:
for i in range(0,5):
    for k,v in tf_dict[i].items():
        print("{0} : \nコーパスでの出現回数 : {1}, IDF値 : {2}\n".format(k,terms_dict[k],idf_dict[k]))

mlb : 
コーパスでの出現回数 : 21, IDF値 : 13.21732305165805

cincinnati : 
コーパスでの出現回数 : 14, IDF値 : 13.802285552379209

red : 
コーパスでの出現回数 : 932, IDF値 : 7.745454329782532

shirt : 
コーパスでの出現回数 : 2874, IDF値 : 6.120796127979475

size : 
コーパスでの出現回数 : 5043, IDF値 : 5.309573964479489

razer : 
コーパスでの出現回数 : 11, IDF値 : 14.150208855799514

blackwidow : 
コーパスでの出現回数 : 3, IDF値 : 16.024677973715654

chroma : 
コーパスでの出現回数 : 6, IDF値 : 15.024677973715656

keyboard : 
コーパスでの出現回数 : 48, IDF値 : 12.024677973715656

ava : 
コーパスでの出現回数 : 10, IDF値 : 14.287712379549449

viv : 
コーパスでの出現回数 : 6, IDF値 : 15.024677973715656

blous : 
コーパスでの出現回数 : 367, IDF値 : 9.090004221593599

leather : 
コーパスでの出現回数 : 634, IDF値 : 8.301301444297405

hors : 
コーパスでの出現回数 : 34, IDF値 : 12.522177633186473

statu : 
コーパスでの出現回数 : 16, IDF値 : 13.609640474436812

gold : 
コーパスでの出現回数 : 1176, IDF値 : 7.409968129600448

plate : 
コーパスでの出現回数 : 137, IDF値 : 10.511608391476285

rose : 
コーパスでの出現回数 : 613, IDF値 : 8.349897210746029



In [113]:
def TFIDF(tf,idf):
    tfidf_dict = {}
    tf_list = list(tf.items())
    for t in tf_list:
        tmp_dict = {}
        for k,v in t[1].items():
            tmp_dict[k] = v*idf[k]
        tfidf_dict[t[0]] = tmp_dict
    return tfidf_dict

In [114]:
tfidf_dict = TFIDF(tf_dict,idf_dict)

In [115]:
for i in range(0,5):
    print("TFIDF値 : {0}\n".format(tfidf_dict[i]))

TFIDF値 : {'mlb': 13.217323051658051, 'cincinnati': 13.802285552379209, 'red': 7.7454543297825316, 'shirt': 6.1207961279794754, 'size': 5.3095739644794886}

TFIDF値 : {'razer': 14.150208855799514, 'blackwidow': 16.024677973715654, 'chroma': 15.024677973715656, 'keyboard': 12.024677973715656}

TFIDF値 : {'ava': 14.287712379549449, 'viv': 15.024677973715656, 'blous': 9.090004221593599}

TFIDF値 : {'leather': 8.3013014442974047, 'hors': 12.522177633186473, 'statu': 13.609640474436812}

TFIDF値 : {'gold': 7.4099681296004478, 'plate': 10.511608391476285, 'rose': 8.349897210746029}



In [120]:
for i in range(0,5):
    print(itemname[i],'\n')
    print('TFIDF : {0}\n'.format(tfidf_dict[i]))
    print('TF : {0}\n'.format(tf_dict[i]))

MLB Cincinnati Reds T Shirt Size XL 

TFIDF : {'mlb': 13.217323051658051, 'cincinnati': 13.802285552379209, 'red': 7.7454543297825316, 'shirt': 6.1207961279794754, 'size': 5.3095739644794886}

TF : {'mlb': 1, 'cincinnati': 1, 'red': 1, 'shirt': 1, 'size': 1}

Razer BlackWidow Chroma Keyboard 

TFIDF : {'razer': 14.150208855799514, 'blackwidow': 16.024677973715654, 'chroma': 15.024677973715656, 'keyboard': 12.024677973715656}

TF : {'razer': 1, 'blackwidow': 1, 'chroma': 1, 'keyboard': 1}

AVA-VIV Blouse 

TFIDF : {'ava': 14.287712379549449, 'viv': 15.024677973715656, 'blous': 9.090004221593599}

TF : {'ava': 1, 'viv': 1, 'blous': 1}

Leather Horse Statues 

TFIDF : {'leather': 8.3013014442974047, 'hors': 12.522177633186473, 'statu': 13.609640474436812}

TF : {'leather': 1, 'hors': 1, 'statu': 1}

24K GOLD plated rose 

TFIDF : {'gold': 7.4099681296004478, 'plate': 10.511608391476285, 'rose': 8.349897210746029}

TF : {'gold': 1, 'plate': 1, 'rose': 1}



In [121]:
itemname_corpora_dict = gensim.corpora.Dictionary(pre_itemname)
print(len(itemname_corpora_dict))

19797


In [122]:
# 自分で作ったコーパス内での用語出現回数リストとライブラリで作ったものの用語出現回数リストが合致しているかチェック
itemname_corpora_dict.dfs[itemname_corpora_dict.token2id['new']],terms_dict['new']

(5103, 5103)

In [123]:
bow_dict = {}
for i in range(0,len(pre_itemname)):
    bow_dict[i] = itemname_corpora_dict.doc2bow(pre_itemname[i])

In [124]:
bow_dict[2],pre_itemname[2]

([(9, 1), (10, 1), (11, 1)], ['ava', 'viv', 'blous'])

In [125]:
tfidf_model = gensim.models.TfidfModel(bow_dict.values(),normalize=False)
tfidf_corpus = tfidf_model[bow_dict.values()]

In [126]:
print("My TFIDF value \n {0}".format(tfidf_dict[2]))
print("gensim libraly TFIDF value")
for k,v in tfidf_dict[2].items():
    for i in tfidf_model[bow_dict[2]]:
        if i[0] == itemname_corpora_dict.token2id[k]:
            print(k,i[1])
print("\n My TFIDF value \n {0}".format(tfidf_dict[3]))
print("gensim libraly TFIDF value")
for k,v in tfidf_dict[3].items():
    for i in tfidf_model[bow_dict[3]]:
        if i[0] == itemname_corpora_dict.token2id[k]:
            print(k,i[1])

My TFIDF value 
 {'ava': 14.287712379549449, 'viv': 15.024677973715656, 'blous': 9.090004221593599}
gensim libraly TFIDF value
ava 13.2877123795
viv 14.0246779737
blous 8.09000422159

 My TFIDF value 
 {'leather': 8.3013014442974047, 'hors': 12.522177633186473, 'statu': 13.609640474436812}
gensim libraly TFIDF value
leather 7.3013014443
hors 11.5221776332
statu 12.6096404744


In [127]:
itemname_corpora_dict.filter_extremes(no_below=3,no_above=0.6)
bow_dict = {}
for i in range(0,len(pre_itemname)):
    bow_dict[i] = itemname_corpora_dict.doc2bow(pre_itemname[i])

In [128]:
tfidf_model = gensim.models.TfidfModel(bow_dict.values(),normalize=True)
tfidf_corpus = tfidf_model[bow_dict.values()]

In [129]:
feature_vec = gensim.matutils.corpus2dense(list(tfidf_corpus), num_terms=len(itemname_corpora_dict))
print(feature_vec.shape)

(7174, 100000)


In [130]:
feature_vec = feature_vec.T
print(feature_vec.shape)

(100000, 7174)


In [131]:
labels = np.array(list(train_df['onehot_first_category_name']))[:feature_vec.shape[0]]
feature_vec.shape,labels.shape

((100000, 7174), (100000,))

In [132]:
X_train = feature_vec[:80000]
y_train = labels[:80000]
X_test = feature_vec[80000:]
y_test = labels[80000:]

In [133]:
# TFIDFで重み付けしたもので分類
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import SGDClassifier
clf = GridSearchCV(SGDClassifier(penalty="l2"), param_grid={'alpha':[0.01,0.05,0.001,0.0001]})

In [134]:
# StratifiedKFold(y, n_folds, shuffle=True) 交差検証用にデータを分割してくれる
for i, (itr, ite) in enumerate(StratifiedKFold(y_test,n_folds=5, shuffle=True)):
    clf.fit(X_train[itr], y_train[itr])
    train_pred = clf.predict(X_train[ite])
    train_acc = metrics.accuracy_score(train_pred,y_train[ite])
    print("{0} finished. test accuracy : {1}".format(i+1,train_acc))

1 finished. test accuracy : 0.7737262737262737
2 finished. test accuracy : 0.7909068198850862
3 finished. test accuracy : 0.78525
4 finished. test accuracy : 0.7850888166124593
5 finished. test accuracy : 0.7862862862862863


In [135]:
ypred = clf.predict(X_test)
accuracy = metrics.accuracy_score(y_test,ypred)
print(accuracy)

0.78095


In [136]:
le.classes_

array(['Beauty', 'Electronics', 'Handmade', 'Home', 'Kids', 'Men', 'Other',
       'Sports & Outdoors', 'Vintage & Collectibles', 'Women'], dtype=object)

In [137]:
print(metrics.classification_report(y_test,ypred,target_names=le.classes_))

                        precision    recall  f1-score   support

                Beauty       0.83      0.88      0.86      2862
           Electronics       0.87      0.85      0.86      1630
              Handmade       0.56      0.07      0.13       391
                  Home       0.72      0.59      0.65       943
                  Kids       0.78      0.61      0.69      2277
                   Men       0.77      0.42      0.55      1303
                 Other       0.70      0.36      0.48       587
     Sports & Outdoors       0.60      0.31      0.41       339
Vintage & Collectibles       0.43      0.19      0.26       588
                 Women       0.77      0.96      0.86      9080

           avg / total       0.77      0.78      0.76     20000



In [138]:
# 重み付けなし
feature_vec2 = gensim.matutils.corpus2dense(list(bow_dict.values()), num_terms=len(itemname_corpora_dict))

In [139]:
feature_vec2 = feature_vec2.T
feature_vec2.shape,labels.shape

((100000, 7174), (100000,))

In [140]:
X_train2 = feature_vec2[:80000]
y_train2 = labels[:80000]
X_test2 = feature_vec2[80000:]
y_test2 = labels[80000:]

In [142]:
clf = GridSearchCV(SGDClassifier(penalty='l2'), param_grid={'alpha':[0.01,0.05,0.001,0.0001]})

for i, (itr, ite) in enumerate(StratifiedKFold(y_test,n_folds=5, shuffle=True)):
    clf.fit(X_train2[itr], y_train2[itr])
    train_pred = clf.predict(X_train2[ite])
    train_acc = metrics.accuracy_score(train_pred,y_train2[ite])
    print("{0} finished. test accuracy : {1}".format(i+1,train_acc))

1 finished. test accuracy : 0.7882117882117882
2 finished. test accuracy : 0.7706719960029977
3 finished. test accuracy : 0.78125
4 finished. test accuracy : 0.7838378784088066
5 finished. test accuracy : 0.7837837837837838


In [144]:
ypred = clf.predict(X_test2)
accuracy = metrics.accuracy_score(y_test2,ypred)
print(accuracy)

0.7827


In [145]:
print(metrics.classification_report(y_test,ypred,target_names=le.classes_))

                        precision    recall  f1-score   support

                Beauty       0.88      0.86      0.87      2862
           Electronics       0.87      0.83      0.85      1630
              Handmade       0.48      0.26      0.34       391
                  Home       0.74      0.60      0.67       943
                  Kids       0.76      0.62      0.69      2277
                   Men       0.72      0.49      0.59      1303
                 Other       0.64      0.39      0.48       587
     Sports & Outdoors       0.70      0.28      0.40       339
Vintage & Collectibles       0.39      0.22      0.29       588
                 Women       0.78      0.95      0.86      9080

           avg / total       0.77      0.78      0.77     20000

