### https://radimrehurek.com/gensim/models/doc2vec.html


In [85]:
import gensim
from gensim import utils

import random
import os
from collections import Counter
from pprint import pprint
import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.metrics import classification_report_imbalanced
import imblearn

import nltk.stem as stem
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

In [86]:
# Directory passed to get_train_test() and get_class_map() as the dataset root.
root_path = './data/dataset/ebay2gg_root_origin'

In [95]:
def get_train_test(rootpath):
    """Load the per-class text files under ``rootpath`` and split them 70/30.

    Every directory entry whose name contains neither ``'new'`` nor
    ``'check'`` is read as one class. Each line of the file becomes one
    sample, labelled with the file's position in the filtered listing.

    Args:
        rootpath: Directory holding one text file per class.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` from a stratified
        ``train_test_split`` with ``test_size=0.3``.
    """
    # Filter on the bare entry name, not the joined path, so a 'new' or
    # 'check' substring inside rootpath itself cannot exclude every file.
    # Labels follow os.listdir() order, the same order get_class_map() uses.
    class_files = [
        os.path.join(rootpath, name)
        for name in os.listdir(rootpath)
        if 'new' not in name and 'check' not in name
    ]

    X = []
    y = []
    for label, path in enumerate(class_files):
        # Context manager so the handle is closed even if reading fails.
        with open(path) as handle:
            lines = handle.readlines()
        X.extend(lines)
        y.extend([label] * len(lines))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y
    )
    return X_train, X_test, y_train, y_test


def get_class_map(rootpath):
    """Map each integer class label to its class name.

    The class name is the part of the file name before the first ``'.'``.
    Entries whose name contains ``'new'`` or ``'check'`` are ignored, and
    labels follow ``os.listdir`` order, as in ``get_train_test``.

    Args:
        rootpath: Directory holding one text file per class.

    Returns:
        dict mapping ``label (int) -> class name (str)``.
    """
    # Use the argument rather than the notebook-level ``root_path`` global,
    # so the function works for any directory it is given.
    class_files = [
        name
        for name in os.listdir(rootpath)
        if 'new' not in name and 'check' not in name
    ]
    return {label: name.split('.')[0] for label, name in enumerate(class_files)}



class tfidf_rf_pipe(object):
    """Text classifier: a TF-IDF vectorizer feeding a random forest, in one pipeline."""

    def __init__(self):
        # 20 trees, fitted with 8 parallel jobs.
        vectorizer = TfidfVectorizer()
        forest = RandomForestClassifier(n_estimators=20, n_jobs=8)
        self.pipeline = make_pipeline(vectorizer, forest)

    def train(self, X_train, y_train):
        """Fit the pipeline on the training texts and labels, then print a completion message."""
        self.pipeline.fit(X_train, y_train)
        print('train complete!')

    def validation(self, X_val, y_val):
        """Predict on ``X_val`` and return the imbalanced-classification report against ``y_val``."""
        predicted = self.pipeline.predict(X_val)
        return classification_report_imbalanced(y_val, predicted)

    def predict(self, X_test):
        """Return the pipeline's predicted labels for ``X_test``."""
        labels = self.pipeline.predict(X_test)
        return labels

In [88]:
# Build the stratified train/test split from the class files under root_path.
X_train , X_test , y_train  , y_test = get_train_test(root_path)

In [97]:
# Disabled: per-class sample counts of the train and test splits.
# pprint('Training class distributions summary: ')
# pprint((sorted(Counter(y_train).items())))
# pprint('Test class distributions summary: ')
# pprint((sorted(Counter(y_test).items())))
# Print the signature and docstring of tfidf_rf_pipe.train.
help(tfidf_rf_pipe.train)

Help on function train in module __main__:

train(self, X_train, y_train)
    注释



In [7]:
# Fit the TF-IDF + random-forest classifier on the training split, then print
# the per-class report computed on the held-out test split.
model=tfidf_rf_pipe()
model.train(X_train,y_train)
res=model.validation(X_test,y_test)
print(res)

train complete!
                   pre       rec       spe        f1       geo       iba       sup

          0       0.98      0.90      1.00      0.94      0.99      0.97        97
          1       0.98      0.97      1.00      0.97      0.99      0.98      1744
          2       0.85      0.60      1.00      0.71      0.92      0.84        58
          3       0.92      0.95      0.97      0.93      0.95      0.90      2932
          4       0.74      0.65      1.00      0.69      0.86      0.72        54
          5       0.87      0.81      1.00      0.84      0.93      0.85       105
          6       0.79      0.74      1.00      0.77      0.89      0.78        42
          7       0.90      0.87      0.99      0.89      0.94      0.88      1064
          8       0.92      0.89      0.99      0.91      0.96      0.91       986
          9       0.58      0.47      1.00      0.52      0.76      0.56        15
         10       0.98      0.97      1.00      0.97      0.99      0.

In [9]:
# Two hand-written eBay category paths used as a smoke test for the classifier.
new_x_test = ['Pottery & Glass:Pottery & China:China & Dinnerware:Meakin J. & G.',
             'Home & Garden:Tools:Power Tools:Buffers & Polishers']
# class_map (label -> class name) is also read as a global by similar_model.train_tfidf.
class_map = get_class_map(root_path)
pred_label = model.predict(new_x_test)
# Disabled: print the class name of each predicted label.
# for i in pred_label:
#     print(class_map[i])
    

In [155]:
class similar_model(object):
    """Retrieve the Google product categories most similar to an eBay category string.

    ``train_tfidf`` indexes every Google category whose first level matches
    the predicted class. ``predict`` then returns the best-matching category
    names for one eBay category path.

    Args:
        gpc_id2name_path: Path to a tab-separated file with a ``GPC_NAME``
            column of ``'>'``-separated category paths.
        class_map: Optional mapping from class label to first-level category
            name. When omitted, the module-level ``class_map`` is used, as
            before.
    """

    # Punctuation that predict() turns into spaces before tokenising a query.
    _QUERY_SEPARATORS = str.maketrans(dict.fromkeys("&:()/,-", " "))

    def __init__(self, gpc_id2name_path, class_map=None):
        self.gpc_id2name_path = gpc_id2name_path
        self.class_map = class_map
        self.s = stem.SnowballStemmer('english')
        # The attributes below are populated by train_tfidf().
        self.sims = None
        self.tf_idf = None
        self.dictionary = None
        self.new_list = None

    def _tokens(self, text):
        """Tokenise ``text`` and return the lower-cased, stemmed tokens."""
        return [self.s.stem(w.lower()) for w in word_tokenize(text)]

    def train_tfidf(self, cate_class):
        """Index the Google categories belonging to the class ``cate_class``.

        Args:
            cate_class: Class label; looked up in the class map to obtain the
                first-level Google category name.
        """
        # Prefer the explicitly supplied mapping; otherwise fall back to the
        # notebook-level ``class_map`` global the original code relied on.
        mapping = self.class_map if self.class_map is not None else class_map
        first_level = mapping[cate_class]

        gpc = pd.read_csv(self.gpc_id2name_path, sep='\t')
        # Plain comparison instead of a string-formatted DataFrame.query(), so
        # category names containing quotes cannot break the filter expression.
        self.new_list = [
            name for name in gpc["GPC_NAME"].tolist()
            if name.split(">")[0].strip() == first_level
        ]

        cleaned = [name.replace(">", " ").replace("&", " ") for name in self.new_list]
        gen_docs = [self._tokens(text) for text in cleaned]
        self.dictionary = gensim.corpora.Dictionary(gen_docs)
        corpus = [self.dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
        self.tf_idf = gensim.models.TfidfModel(corpus)

        # NOTE(review): the index is built from raw bag-of-words counts while
        # predict() queries it with TF-IDF weights. Indexing
        # ``self.tf_idf[corpus]`` was tried and left disabled by the author;
        # behaviour is kept as-is here -- confirm which variant is intended.
        # './' is the first (output prefix) argument of the gensim index.
        self.sims = gensim.similarities.Similarity(
            './', corpus, num_features=len(self.dictionary), num_best=5, norm='l2'
        )

    def predict(self, sentense):
        """Return the names of the indexed categories most similar to ``sentense``.

        Args:
            sentense: eBay category path, e.g. ``'Home & Garden:Tools'``.

        Returns:
            List of Google category names, one per hit returned by the index
            (built with ``num_best=5``). When the index returns no hit, a
            one-element list with the first indexed category, or an empty
            list if nothing was indexed.

        Raises:
            RuntimeError: If ``train_tfidf`` has not been called yet.
        """
        if self.sims is None or self.tf_idf is None or self.dictionary is None:
            raise RuntimeError("train_tfidf() must be called before predict()")

        query_doc = self._tokens(sentense.translate(self._QUERY_SEPARATORS))
        query_doc_bow = self.dictionary.doc2bow(query_doc)
        hits = self.sims[self.tf_idf[query_doc_bow]]

        if len(hits) == 0:
            # No match: fall back to the first indexed category (presumably
            # the shortest, top-level entry -- verify against the data file).
            # Slicing avoids an IndexError when nothing was indexed.
            return self.new_list[:1]
        # Each hit's first element is the position of the document in new_list.
        return [self.new_list[hit[0]] for hit in hits]
        
    


In [154]:
# Print gensim's built-in documentation for the Similarity index class.
help(gensim.similarities.Similarity)

Help on class Similarity in module gensim.similarities.docsim:

class Similarity(gensim.interfaces.SimilarityABC)
 |  Compute cosine similarity of a dynamic query against a static corpus of documents
 |  ("the index").
 |  
 |  Scalability is achieved by sharding the index into smaller pieces, each of which
 |  fits into core memory (see the `(Sparse)MatrixSimilarity` classes in this module).
 |  The shards themselves are simply stored as files to disk and mmap'ed back as needed.
 |  
 |  Method resolution order:
 |      Similarity
 |      gensim.interfaces.SimilarityABC
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, query)
 |      Get similarities of document `query` to all documents in the corpus.
 |      
 |      **or**
 |      
 |      If `query` is a corpus (iterable of documents), return a matrix of similarities
 |      of all query documents vs. all corpus document. This batch query is more
 |      efficient than c

In [13]:
# Tab-separated file read by similar_model.train_tfidf (GPC_NAME column).
gpc_id2name_path = './data/gpc_id2name.tsv'

# Build the similarity index for the class predicted for the first sample.
model2 = similar_model(gpc_id2name_path)
model2.train_tfidf(pred_label[0])

In [86]:
# Query the index with the first sample eBay category; the bare name on the
# last line makes the notebook display the result.
top5_res= model2.predict(new_x_test[0])
top5_res

['potteri', 'glass', 'potteri', 'china', 'china', 'dinnerwar', 'meakin', 'j.', 'g', '.']


[]

### load eBay live data

In [73]:
# Read the eBay category export and keep only the US rows (site_id == 0).
df = pd.read_csv('./data/ares-presto_run_4_stmt_1_0.csv', sep=',')
df = df.loc[df["site_id"] == 0]
# Keep rows whose leaf_categ_id equals their move_to value.
df = df.loc[df["leaf_categ_id"] == df["move_to"]]
print(len(df))
# Flatten the single leaf_categ_name column into a 1-D array of category names.
test = df[['leaf_categ_name']].values.ravel()

16996


In [153]:
# i = 10
# test[i:i+100]

In [139]:
# Pick one eBay category by position and build a similarity index for the
# class the classifier predicts for it.
index = 232
gpc_id2name_path = './data/gpc_id2name.tsv'

# A one-element slice (not a bare string) is passed to model.predict.
pred_label = model.predict(test[index:index+1])
model2 = similar_model(gpc_id2name_path)
model2.train_tfidf(pred_label[0])


In [140]:
# Show the selected eBay category followed by its candidate Google categories.
top5_res = model2.predict(test[index])
print("ebay category    :", test[index])
print()

for i, candidate in enumerate(top5_res):
    print("google category_{}:".format(i), candidate)

ebay category    : Jewelry & Watches:Men's Jewelry:Bolo Ties

google category_0: Apparel & Accessories > Jewelry > Watches
google category_1: Apparel & Accessories > Jewelry > Jewelry Sets
google category_2: Apparel & Accessories > Jewelry > Body Jewelry
google category_3: Apparel & Accessories > Jewelry > Watch Accessories > Watch Bands
google category_4: Apparel & Accessories > Jewelry > Watch Accessories > Watch Winders


In [143]:
# Reference eBay -> Google mapping table; the Google category is read from
# the second column of the first row matching the eBay category name.
df_finn = pd.read_csv('./data/ebay2gg_table')
# Boolean mask instead of a string-formatted query(): a category name that
# contains a double quote would otherwise produce an invalid expression.
real = df_finn[df_finn['leaf_categ_name'] == test[index]].values[0][1]
print("real google category:"+real)

real google category:Apparel & Accessories > Jewelry > Watch Accessories


In [158]:
from tqdm import trange

# Top-1 accuracy check over the first 100 eBay categories in ``test``.
count = 0  # predictions whose best candidate equals the reference category
new = 0    # categories without a row in df_finn (excluded from the score)
for index in trange(100):
    # One mask-based lookup instead of two string-formatted query() calls:
    # it is quote-safe and avoids scanning the table twice per category.
    matches = df_finn[df_finn['leaf_categ_name'] == test[index]]
    if len(matches) == 0:
        # No reference mapping: skip before paying for classification and
        # index construction.
        new += 1
        continue
    y_real = matches.values[0][1]

    pred_label = model.predict(test[index:index+1])
    model2 = similar_model(gpc_id2name_path)
    model2.train_tfidf(pred_label[0])
    top5_res = model2.predict(test[index])
    y_pred = top5_res[0]

    if y_real == y_pred:
        count += 1


100%|██████████| 100/100 [00:57<00:00,  1.75it/s]


In [160]:
# Fraction of correct top-1 predictions among the categories that have a
# reference mapping; the literal 100 must match the range of the evaluation
# loop, and the division fails if every category was counted in ``new``.
print(count/(100-new))

0.26804123711340205


In [83]:
# Print gensim's built-in documentation for the Similarity index class.
help(gensim.similarities.Similarity)
# NOTE(review): this statement refers to ``self`` and ``corpus`` outside of any
# method, so it raises NameError when the cell runs top to bottom. It looks
# like a pasted copy of the TF-IDF-weighted index variant from
# similar_model.train_tfidf kept for reference -- confirm and remove or move it.
self.sims = gensim.similarities.Similarity('./',self.tf_idf[corpus],num_features=len(self.dictionary),num_best=5)


Help on class Similarity in module gensim.similarities.docsim:

class Similarity(gensim.interfaces.SimilarityABC)
 |  Compute cosine similarity of a dynamic query against a static corpus of documents
 |  ("the index").
 |  
 |  Scalability is achieved by sharding the index into smaller pieces, each of which
 |  fits into core memory (see the `(Sparse)MatrixSimilarity` classes in this module).
 |  The shards themselves are simply stored as files to disk and mmap'ed back as needed.
 |  
 |  Method resolution order:
 |      Similarity
 |      gensim.interfaces.SimilarityABC
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, query)
 |      Get similarities of document `query` to all documents in the corpus.
 |      
 |      **or**
 |      
 |      If `query` is a corpus (iterable of documents), return a matrix of similarities
 |      of all query documents vs. all corpus document. This batch query is more
 |      efficient than c