### https://radimrehurek.com/gensim/models/doc2vec.html


In [1]:
import gensim
from gensim import utils

import random
import os
from collections import Counter
from pprint import pprint
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced
import imblearn

import nltk.stem as stem
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [3]:
def get_train_test(rootpath, test_size=0.3, random_state=None):
    '''
    Load every category file under ``rootpath`` and split the samples.

    Each entry directly under ``rootpath`` whose name contains neither
    'new' nor 'check' is treated as one class. Every line of such a file
    becomes one sample, labelled with the file's position in the filtered
    directory listing.

    input :
        rootpath(str)  directory holding one file per google root category
        test_size      fraction of the samples held out for testing
                       (default 0.3, i.e. a 70% / 30% split)
        random_state   forwarded to train_test_split; pass an int to get a
                       reproducible split (default None: a new random split
                       on every call)
    return : X_train,X_test,y_train,y_test, split with stratification on
             the labels
    '''
    # Filter on the entry name rather than the joined path, so that a root
    # directory whose own path happens to contain 'new' or 'check' does not
    # silently discard every file.
    file_names = [name for name in os.listdir(rootpath)
                  if 'new' not in name and 'check' not in name]

    X = []
    y = []
    # Labels follow the filtered os.listdir order; get_class_map enumerates
    # the same filtered listing, which is what keeps labels and names aligned.
    for label, name in enumerate(file_names):
        # Context manager so the handle is closed once the lines are read.
        with open(os.path.join(rootpath, name)) as category_file:
            lines = category_file.readlines()
        X.extend(lines)
        y.extend([label] * len(lines))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state)
    return X_train, X_test, y_train, y_test



def get_class_map(rootpath):
    '''
    Build the lookup from class label to google root category name.

    The entries directly under ``rootpath`` are listed, names containing
    'new' or 'check' are dropped, and the remaining entries are numbered in
    listing order. The category name is the part of the entry name before
    its first '.'.

    input : rootpath(str)  directory holding one file per google root category
    return : dict whose keys are the class labels (0, 1, 2, ...) and whose
             values are the matching google root category names
    '''
    # Use the parameter, not a module-level variable, so the function works
    # for whichever directory the caller passes in.
    file_names = [name for name in os.listdir(rootpath)
                  if 'new' not in name and 'check' not in name]
    return {label: name.split('.')[0] for label, name in enumerate(file_names)}



class tfidf_rf_pipe(object):
    '''
    Pipeline that assigns an ebay category string to one of the google root
    categories.
    1. TF-IDF turns each category string into a sparse vector.
    2. A random forest classifier predicts the class label.
    '''

    def __init__(self, n_estimators=20, n_jobs=8):
        '''
        Build the (untrained) pipeline.
        input : n_estimators  number of trees in the random forest (default 20)
                n_jobs        value passed to the forest as n_jobs (default 8)
        '''
        self.pipeline = make_pipeline(
            TfidfVectorizer(),
            RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs),
        )

    def train(self, X_train, y_train):
        '''
        Fit the pipeline.
        input : X_train,y_train(list)
        '''
        self.pipeline.fit(X_train, y_train)
        print('train complete!')

    def validation(self, X_val, y_val, target_names=None):
        '''
        Evaluate the fitted pipeline on held-out data.
        input : X_val,y_val(list)
                target_names  display names for the classes in the report;
                              when omitted, the values of the module-level
                              class_map are used
        return : the report built by classification_report_imbalanced
                 (columns pre, rec, spe, f1, geo, iba, sup)
        '''
        if target_names is None:
            # Backward-compatible default: class_map must already be defined
            # at module level when this is called without target_names.
            target_names = list(class_map.values())
        y_pred = self.pipeline.predict(X_val)
        return classification_report_imbalanced(y_val, y_pred, target_names=target_names)

    def predict(self, X_test):
        '''
        Predict the class label for new ebay category strings.
        input : X_test(list)
        return : the predicted labels, as returned by the pipeline
        '''
        return self.pipeline.predict(X_test)

In [4]:
'''
The directory at root_path holds 21 files, one per google root category; each file stores all the ebay categories that map to it.
'''

# Read the per-category files and build the train/test split.
root_path = './data/dataset/ebay2gg_root_origin'
X_train,X_test,y_train,y_test = get_train_test(root_path)

In [5]:
# pprint('Training class distributions summary: ')
# pprint((sorted(Counter(y_train).items())))
# pprint('Test class distributions summary: ')
# pprint((sorted(Counter(y_test).items())))

In [6]:
# help(classification_report_imbalanced)

In [7]:
# Build the label -> category-name lookup and display it.
class_map = get_class_map(root_path)
pprint(class_map)
# Collect the names to confirm how many classes were found.
tmp = list(class_map.values())
len(tmp)

{0: 'Office Supplies',
 1: 'Food, Beverages & Tobacco',
 2: 'Home & Garden',
 3: 'Cameras & Optics',
 4: 'Religious & Ceremonial',
 5: 'Apparel & Accessories',
 6: 'Mature',
 7: 'Business & Industrial',
 8: 'Sporting Goods',
 9: 'Hardware',
 10: 'Baby & Toddler',
 11: 'Media',
 12: 'Animals & Pet Supplies',
 13: 'Arts & Entertainment',
 14: 'Luggage & Bags',
 15: 'Software',
 16: 'Furniture',
 17: 'Toys & Games',
 18: 'Electronics',
 19: 'Vehicles & Parts',
 20: 'Health & Beauty'}


21

In [8]:
# Fit the TF-IDF + random forest pipeline on the training split.
model=tfidf_rf_pipe()
model.train(X_train,y_train)
# Report per-class metrics on the held-out split.
res=model.validation(X_test,y_test)
print(res)

train complete!
                                 pre       rec       spe        f1       geo       iba       sup

          Office Supplies       0.95      0.79      1.00      0.87      0.97      0.94        97
Food, Beverages & Tobacco       0.80      0.81      1.00      0.81      0.89      0.78        54
            Home & Garden       0.90      0.89      0.99      0.89      0.94      0.88       986
         Cameras & Optics       0.92      0.98      1.00      0.95      0.96      0.92       184
   Religious & Ceremonial       1.00      0.44      1.00      0.62      1.00      1.00         9
    Apparel & Accessories       0.98      0.97      1.00      0.98      0.99      0.97      1744
                   Mature       0.89      0.80      1.00      0.84      0.94      0.88        10
    Business & Industrial       0.90      0.87      0.99      0.88      0.94      0.88      1064
           Sporting Goods       0.95      0.95      1.00      0.95      0.97      0.95      1087
             

In [9]:
# new_x_test = ['Pottery & Glass:Pottery & China:China & Dinnerware:Meakin J. & G.',
#              'Sporting Goods:Outdoor Sports:Climbing & Caving:Books & Video']

# pred_label = model.predict(new_x_test)
# for i in pred_label:
#     print(class_map[i])
    

In [10]:
class similar_model(object):
    '''
    Find the google categories most similar to an ebay category string,
    restricted to a single google root category.

    train_tfidf builds a gensim dictionary, tf-idf model and similarity
    index from the google category names of one root category; predict
    returns the best-matching category names for a query string.
    '''

    # Punctuation replaced by a space before a query string is tokenised.
    _QUERY_SEPARATORS = str.maketrans(dict.fromkeys("&:()/,-", " "))

    def __init__(self, gpc_id2name_path):
        '''
        input : gpc_id2name_path(str)  path of a tab-separated file that has
                a GPC_NAME column of full google category names
        '''
        self.gpc_id2name_path = gpc_id2name_path
        self.s = stem.SnowballStemmer('english')
        # The attributes below stay None until train_tfidf has run.
        self.sims = None
        self.tf_idf = None
        self.dictionary = None
        self.new_list = None

    def _stem_tokens(self, text):
        '''Tokenise text and return the lower-cased, stemmed tokens.'''
        return [self.s.stem(word.lower()) for word in word_tokenize(text)]

    def train_tfidf(self, cate_class, class_names=None):
        '''
        Build the similarity index for one google root category.
        input : cate_class   class label of the root category to index
                class_names  mapping from class label to root category name;
                             when omitted, the module-level class_map is used
        '''
        if class_names is None:
            # Backward-compatible default: rely on the module-level class_map.
            class_names = class_map
        root_name = class_names[cate_class]

        gpc_names = pd.read_csv(self.gpc_id2name_path, sep='\t')["GPC_NAME"]
        # Keep the categories whose first ">"-separated segment is the chosen
        # root category. Comparing directly (instead of formatting the name
        # into a DataFrame.query string) also works for names with quotes.
        self.new_list = [name for name in gpc_names
                         if name.split(">")[0].strip() == root_name]

        documents = [self._stem_tokens(name.replace(">", " ").replace("&", " "))
                     for name in self.new_list]
        self.dictionary = gensim.corpora.Dictionary(documents)
        corpus = [self.dictionary.doc2bow(document) for document in documents]
        self.tf_idf = gensim.models.TfidfModel(corpus)

        # NOTE(review): the index is built from the raw bag-of-words corpus,
        # while predict() queries it with tf-idf weighted vectors. An earlier
        # variant indexed self.tf_idf[corpus] instead - confirm which one is
        # intended before changing it, since it affects the ranking.
        # './' is passed as gensim's output prefix for the index.
        self.sims = gensim.similarities.Similarity(
            './', corpus, num_features=len(self.dictionary), num_best=5)
        print('train complete!')

    def predict(self, sentense):
        '''
        Return the indexed google categories most similar to an ebay category.
        input : sentense(str)  ebay category string
        return : list of google category names (at most 5, the index's
                 num_best); when the index returns no hit, a one-element
                 list holding the first indexed category
        raises : RuntimeError when called before train_tfidf
        '''
        if self.dictionary is None or self.tf_idf is None or self.sims is None:
            raise RuntimeError('train_tfidf must be called before predict')

        cleaned = sentense.translate(self._QUERY_SEPARATORS)
        query_bow = self.dictionary.doc2bow(self._stem_tokens(cleaned))
        hits = self.sims[self.tf_idf[query_bow]]

        if len(hits) == 0:
            # Nothing matched: fall back to the first category of the root.
            return [self.new_list[0]]
        # Each hit starts with the index of the matched document.
        return [self.new_list[hit[0]] for hit in hits]
        
    


In [13]:
# gpc_id2name_path = './data/gpc_id2name.tsv'

# model2 = similar_model(gpc_id2name_path)
# model2.train_tfidf(pred_label[0])

In [15]:
# top5_res= model2.predict(new_x_test[0])
# top5_res

### Load eBay live data

In [11]:
# Read the ebay category export and keep the US rows (site_id 0, per the
# original note), then only the rows whose leaf_categ_id equals move_to.
df = pd.read_csv('./data/ares-presto_run_4_stmt_1_0.csv', sep=',')
df = df[df["site_id"] == 0]
df = df[df["leaf_categ_id"] == df["move_to"]]
print(len(df))
# Flatten the single-column frame into a 1-D array of category strings.
test = df[['leaf_categ_name']].values.reshape(len(df))

16996


In [14]:
# Preview 30 category strings starting at offset i.
i = 10
test[i:i+30]

array([ 'Business & Industrial:Healthcare, Lab & Life Science:Medical Specialties:Emergency & EMT:Training & Manikins',
       'Business & Industrial:Healthcare, Lab & Life Science:Medical Specialties:Emergency & EMT:Other Emergency & EMT',
       'Business & Industrial:Healthcare, Lab & Life Science:Lab Supplies:Lab Kits & Sets',
       'Business & Industrial:Healthcare, Lab & Life Science:Lab Supplies:Plasticware',
       'Toys & Hobbies:Radio Control & Control Line:RC Model Vehicle Parts & Accs:Engine, Exhaust & Fuel Systems:Electric Motors',
       'Pottery & Glass:Glass:Glassware:Contemporary Glass:Degenhart',
       'Business & Industrial:Electrical & Test Equipment:Connectors, Switches & Wire:Wire & Cable:Telecom Wire & Cable',
       'Business & Industrial:Electrical & Test Equipment:Connectors, Switches & Wire:Wire & Cable:Magnet/Enameled Wire',
       'Business & Industrial:Electrical & Test Equipment:Electrical Supply Equipment:Electr. Supply Books & Manuals',
       'Busine

In [82]:
# Position in `test` of the ebay category to classify.
index = 888

# Stage 1: predict the root-category label (a one-element slice keeps the input a sequence).
pred_label = model.predict(test[index:index+1])


# Stage 2: build the similarity index for the predicted root category only.
gpc_id2name_path = './data/gpc_id2name.tsv'
model2 = similar_model(gpc_id2name_path)
model2.train_tfidf(pred_label[0])


train complete!


In [83]:
# Look up the google categories closest to the chosen ebay category.
top5_res = model2.predict(test[index])
# Print the query, a blank separator line, then each match on its own line.
print(test[index])
print()
for i in top5_res:
    print(i)

Collectibles:Vintage, Retro, Mid-Century:Plastic

Home & Garden > Kitchen & Dining > Food Storage > Food Wraps > Plastic Wrap
