# INPUT DATASET ENGLISH AND TRADITIONAL-CHINESE

In [1]:
import pandas as pd
from tqdm import tqdm
import time

In [2]:
train_EN = pd.read_csv('train_en.csv')
train_EN

Unnamed: 0,product_title,category
0,Recollections Color Splash Clear Stamps & Stencil,Hobbies & Stationery
1,"soap,lotion scrub set 400",Health & Personal Care
2,Spigen Galaxy S10e Case Tough Armor Gunmetal,Mobile Accessories
3,Acrylic Lanalon Bright Red,Hobbies & Stationery
4,303 FLAT SHEET/Blanket 100% cotton,Home & Living
...,...,...
499995,rocker arm roller racing mio,Motors
499996,Secosana (preloved bag),Women's Bags
499997,jag bag,Women's Bags
499998,Baby wipes 15 sheets (Alcohol and Paraben Free...,Babies & Kids


In [3]:
train_TCN = pd.read_csv('train_tcn.csv', encoding='utf-8-sig')
train_TCN

Unnamed: 0,product_title,category
0,Gucci Gucci Guilty Pour Femme Stud Edition 罪愛女...,Health & Beauty
1,（二手）PS4 GTA 5 俠盜獵車手5 Grand Theif Auto V繁體 中文版,Game Kingdom
2,百獸卡,Life & Entertainment
3,nac nac活氧全效柔衣素,Mother & Baby
4,#Nike耐吉官方F.C. 男子足球長褲新款標準型 拒水 拉鏈褲腳\nCD0557,Men's Apparel
...,...,...
499995,Dress,Women's Apparel
499996,Lilian Lin,Food & Beverages
499997,77 抹茶杏仁乳加 77乳加 減甜 大人味 大人的77 宇治抹茶 杏仁 宇治抹茶杏仁 抹茶 ...,Food & Beverages
499998,Panasonic 國際牌 電動 牙刷頭 (EW-DM81 專用刷頭) WEW0974-W,Home Electronic


# PREPROCESS THE ENGLISH DATASET

In [4]:
import string
from nltk.tokenize import word_tokenize as WT

print("Punctuations:",string.punctuation)

Punctuations: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


##### 1. Tokenization (for deleting all stopwords)

In [5]:
%%time
train_en_tokenized = []
for words in tqdm(train_EN['product_title']):
    temp = WT(words)
    train_en_tokenized.append(temp)

len(train_en_tokenized)

100%|██████████| 500000/500000 [01:01<00:00, 8142.91it/s] 

Wall time: 1min 1s





500000

##### 2. Delete Numbers in Tokenized Result

In [6]:
import re
def number_remover(list): 
    pattern = '[0-9]'
    list = [re.sub(pattern, '', i) for i in list] 
    return list

In [7]:
train_en_tokenized_nonum = []
for tokenized_words in tqdm(train_en_tokenized):
    train_en_tokenized_nonum.append(number_remover(tokenized_words))

len(train_en_tokenized_nonum)

100%|██████████| 500000/500000 [00:04<00:00, 116890.87it/s]


500000

##### 3. Remove punctuations and multi-whitespaces, also transform the data into the lowercase form

In [8]:
def advance_remover(list_input):
    punctuations = [char for char in string.punctuation]
    separator = ' '
    joined_string = str(separator.join(list_input))
    for char in punctuations:
        joined_string = joined_string.replace(char,'') #remove special character
        joined_string = re.sub(' +', ' ', joined_string) #remove multi-whitespaces
    return joined_string.lower()

In [9]:
%%time
cleaned_data_en = []
for words in tqdm(train_en_tokenized_nonum):
    cleaned_data_en.append(advance_remover(words))

len(cleaned_data_en)

100%|██████████| 500000/500000 [00:53<00:00, 9308.15it/s] 

Wall time: 53.7 s





500000

In [10]:
# A = pd.DataFrame({'product_title':cleaned_data_en, 'category':train_EN['category']})
# A.to_csv('D:\A Shopee Code League\#4 Title Translation\cleaned_data_en.csv', index=False, encoding='utf-8-sig')

# PREPROCESS THE TRADITIONAL-CHINESE DATASET

##### References 
1. https://www.dlology.com/blog/tutorial-chinese-sentiment-analysis-with-hotel-review-data/
2. https://github.com/Tony607/Chinese_sentiment_analysis
3. https://github.com/jacquessham/tokenize_chinese_nlp
4. https://stackoverflow.com/questions/2718196/find-all-chinese-text-in-a-string-using-python-and-regex

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import jieba

Using TensorFlow backend.


In [14]:
def non_chinese_words_remover(words):
    chinese_words_only = []
    for n in re.findall('[\u4e00-\u9fff]+', words):
        chinese_words_only.append(n)
    return ''.join(chinese_words_only)

In [15]:
def chinese_tokenizer(words):
    tcn_words = non_chinese_words_remover(words)
    if tcn_words == '':
        return words
    else:
        chinese_stopwords = [line.rstrip() for line in open('D:\A Shopee Code League\#4 Title Translation\chinese-stopwords.txt',"r", encoding="utf-8")]
        for char in chinese_stopwords:
            tcn_words = tcn_words.replace(char,'')
            tcn_words = tcn_words.replace(' ','')
        temp = ' '.join(jieba.cut_for_search(tcn_words))
        return temp

In [None]:
cleaned_data_tcn = []
for chinese_words in tqdm(train_TCN['product_title']):
    cleaned_data_tcn.append(chinese_tokenizer(chinese_words))

In [60]:
# B = pd.DataFrame({'product_title':cleaned_data_tcn, 'category':train_TCN['category']})
# B.to_csv('D:\A Shopee Code League\#4 Title Translation\cleaned_data_tcn.csv', index=False, encoding='utf-8-sig')

# CHECK CATEGORY IN BOTH CLEANED DATASET

In [17]:
data_tcn = pd.read_csv('D:\A Shopee Code League\#4 Title Translation\cleaned_data_tcn.csv')
data_en  = pd.read_csv('D:\A Shopee Code League\#4 Title Translation\cleaned_data_en.csv')

In [18]:
category_tcn = data_tcn['category'].unique()
category_en = data_en['category'].unique()

print(category_tcn)
print(len(category_tcn))

print(category_en)
print(len(category_en))

['Health & Beauty' 'Game Kingdom' 'Life & Entertainment' 'Mother & Baby'
 "Men's Apparel" "Women's Apparel" 'Women Bags' 'Women Shoes'
 'Home & Living' 'Everything Else' 'Hardware & 3C'
 "Men's Bags& Accessories" 'Sports & Outdoors' 'Helpbuy' 'Men Shoes'
 'Books' 'Motors' 'Home Electronic' 'Food & Beverages' 'Mobile & Gadgets'
 'Women Accessories' 'Pets' 'Tickets & Services' nan ' Mobile & Gadgets'
 'X']
26
['Hobbies & Stationery' 'Health & Personal Care' 'Mobile Accessories'
 'Home & Living' "Women's Apparel" 'Women Shoes' 'Babies & Kids'
 'Women Accessories' 'Toys, Games & Collectibles' 'Groceries' 'Motors'
 'Makeup & Fragrances' "Women's Bags" "Men's Apparel" 'Pet Care'
 "Men's Bags & Accessories" 'Sports & Travel' 'Men Shoes' 'Gaming'
 'Laptops & Computers' 'Home Entertainment' 'Mobiles & Gadgets' 'Cameras'
 'Home Appliances' 'Consumer Electronics' ' Games & Collectibles"'
 'Digital Goods & Vouchers']
27


In [19]:
temp = list(set(category_en)-set(category_tcn))
same_category = list(set(category_en)-set(temp))
same_category

['Women Shoes',
 "Women's Apparel",
 "Men's Apparel",
 'Men Shoes',
 'Motors',
 'Women Accessories',
 'Home & Living']

In [20]:
filtered_en = data_en[data_en.category.isin(same_category)]
filtered_en.head()

Unnamed: 0,product_title,category
4,flat sheetblanket cotton,Home & Living
5,korean set,Women's Apparel
6,highgrade keychain,Home & Living
7,【cod】chanel blackwhite sneaker shoes for women,Women Shoes
8,cat eyeglasses,Women's Apparel


In [21]:
filtered_tcn = data_tcn[data_tcn.category.isin(same_category)]
filtered_tcn.head()

Unnamed: 0,product_title,category
4,耐吉官 男子 足球 長 褲 新款 標準 型 拒水 拉鏈 褲腳,Men's Apparel
5,火影 忍道 鸣 睡衣 卡卡 西宇智 波佐助 恤 次 元 动漫 短袖 衣服,Women's Apparel
9,拇指 鞋坊 金標 貝殼 頭紅時 尚 男女 鞋板 鞋 時尚 情侶 鞋 男鞋 女鞋,Women Shoes
10,居家 推款 記憶 棉枕護 勁 頸 椎 枕修 復 脊椎 治頸椎 專枕頭 女孕婦 失眠 保健 理療 枕,Home & Living
14,炫彩 細口手 沖 壺桃紅 藍 黃,Home & Living


# TRADITIONAL CHINESE TRANSLATION TO ENGLISH USING GOOGLE API

In [22]:
import googletrans, time
from googletrans import Translator
import nltk

In [23]:
filtered_tcn['product_title']

4                            耐吉官 男子 足球 長 褲 新款 標準 型 拒水 拉鏈 褲腳
5                      火影 忍道 鸣 睡衣 卡卡 西宇智 波佐助 恤 次 元 动漫 短袖 衣服
9                   拇指 鞋坊 金標 貝殼 頭紅時 尚 男女 鞋板 鞋 時尚 情侶 鞋 男鞋 女鞋
10        居家 推款 記憶 棉枕護 勁 頸 椎 枕修 復 脊椎 治頸椎 專枕頭 女孕婦 失眠 保健 理療 枕
14                                         炫彩 細口手 沖 壺桃紅 藍 黃
                                ...                        
499986                                                   牙膏
499988                                                棒球 外套
499992                           創意 臺 燈學生 學習 燈 充電 臺 燈 充電夜 燈
499994      男裝 夏季 男裝 青少 少年 青少年 短袖 恤色 修身 韓版圓 領純 棉 衣服 袖 男夏 潮流
499995                                                Dress
Name: product_title, Length: 224598, dtype: object

In [24]:
dataset = filtered_tcn['product_title']
dataset

4                            耐吉官 男子 足球 長 褲 新款 標準 型 拒水 拉鏈 褲腳
5                      火影 忍道 鸣 睡衣 卡卡 西宇智 波佐助 恤 次 元 动漫 短袖 衣服
9                   拇指 鞋坊 金標 貝殼 頭紅時 尚 男女 鞋板 鞋 時尚 情侶 鞋 男鞋 女鞋
10        居家 推款 記憶 棉枕護 勁 頸 椎 枕修 復 脊椎 治頸椎 專枕頭 女孕婦 失眠 保健 理療 枕
14                                         炫彩 細口手 沖 壺桃紅 藍 黃
                                ...                        
499986                                                   牙膏
499988                                                棒球 外套
499992                           創意 臺 燈學生 學習 燈 充電 臺 燈 充電夜 燈
499994      男裝 夏季 男裝 青少 少年 青少年 短袖 恤色 修身 韓版圓 領純 棉 衣服 袖 男夏 潮流
499995                                                Dress
Name: product_title, Length: 224598, dtype: object

In [None]:
start = time.time()
translation_result = []
translator= Translator()
for i in tqdm(range(len(dataset))):
    translation_result.append(translator.translate(dataset.iloc[i], dest='en', src='zh-tw').text)
stop = time.time()

In [52]:
translated = pd.DataFrame({'pred': translation_result,'category':filtered_tcn['category'][0:len(translation_result)]})
# print("Test Time: ",stop-start)
print(translated.head())
print(translated.tail())
# translated.to_csv('D:/A Shopee Code League/#4 Title Translation/translated_to_en_cleaned_data_tcn.csv', index=False)

                                                 pred         category
4   Nike official men's football trousers new stan...    Men's Apparel
5   Naruto Shinobi pajamas Kaka Nishi Uchiha Sasuk...  Women's Apparel
9   Thumb shoe square gold label shell shell red w...      Women Shoes
10  Home Push Memory Memory Cotton Pillow Protecti...    Home & Living
14  Dazzling fine-mouthed hand punching pot pink b...    Home & Living
                                                   pred           category
3860                  Striped suspender skirt two-piece    Women's Apparel
3861                           Custom order merchandise  Women Accessories
3862             Collar character split sleeve clothing    Women's Apparel
3863  Postcard Practice Doll Zhu Zhengting Periphera...      Home & Living
3864  Back Butterfly Love Heart Low Waist Panties Sm...    Women's Apparel


# SIMILARITY CHECKING TO MATCH UP THE TRAINING SET

In [26]:
zh_translated = pd.read_csv('D:/A Shopee Code League/#4 Title Translation/translated_to_en_cleaned_data_tcn.csv')
all_sentences = filtered_en.dropna().reset_index().drop('index', axis=1)
# all_sentences = all_sentences[:25000]
all_sentences.head()

Unnamed: 0,product_title,category
0,flat sheetblanket cotton,Home & Living
1,korean set,Women's Apparel
2,highgrade keychain,Home & Living
3,【cod】chanel blackwhite sneaker shoes for women,Women Shoes
4,cat eyeglasses,Women's Apparel


In [27]:
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")

def get_cosine(v1, v2):
    intersects = set(v1.keys()) & set(v2.keys())
    num = sum([v1[x] * v2[x] for x in intersects])
    sum1 = sum([v1[x] ** 2 for x in list(v1.keys())])
    sum2 = sum([v2[x] ** 2 for x in list(v2.keys())])
    den = math.sqrt(sum1) * math.sqrt(sum2)
    if not den:
        return 0.0
    else:
        return round(float(num) / den,3)

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [28]:
all_sentences_vectors = []
for k in tqdm(range(len(all_sentences))):
    all_sentences_vectors.append(text_to_vector(all_sentences['product_title'][k]))

100%|██████████| 208111/208111 [00:04<00:00, 42507.75it/s]


In [29]:
all_sample_vectors = []
for n in tqdm(range(len(zh_translated['pred']))):
    all_sample_vectors.append(text_to_vector(zh_translated['pred'][n]))

100%|██████████| 1669/1669 [00:00<00:00, 30414.24it/s]


##### Check Five Sentences in Translated Traditional Chinese Dataset

In [30]:
doc = []
for j in tqdm(range(50,60)):
    cosine_similarity = []
    vector1 = all_sample_vectors[j]
    for i in range(len(all_sentences)):
        vector2 = all_sentences_vectors[i]
        cosine_similarity.append(get_cosine(vector1, vector2))
    doc.append([zh_translated['pred'][j], all_sentences['product_title'][cosine_similarity.index(max(cosine_similarity))], max(cosine_similarity)])

print('Cosine Similarity calculation completed')

100%|██████████| 10/10 [00:22<00:00,  2.23s/it]

Cosine Similarity calculation completed





In [31]:
doc

[['Sa🐷', ' flat sheetblanket cotton', 0.0],
 ['Fluorescent green shape pocket vest', 'vest', 0.447],
 ["Classic Recommended New Fans Spring New Love Women's Shoes Casual Skateboard Skateboard Shoes Low-Top Canvas Cloth Shoes Canvas Shoes",
  's',
  0.16],
 ['Aiwo Double Bed Package Set Cash Goods Double Bed Single Piece Set Double Bed Package Twin Bed Single Bed',
  ' flat sheetblanket cotton',
  0.0],
 ['Lace lingerie', 'lingerie', 0.707],
 ['Fans complex ancient middle help canvas vulcanization board shoes classic when it is still versatile leisure shoes men and women sports shoes running shoes canvas cloth shoes canvas shoes',
  'canvas shoes',
  0.789],
 ['Sale Uli twill cloth black grass green yellow green ruler', 'green', 0.577],
 ['Quality streamer flag shell head complex Gu Xiu leisure leather men and women shoes',
  'women and men shoes ',
  0.535],
 ['Short-sleeved shirt short high pounds pure cotton wild collar collar leisure leisure men and women models couple models short 