In [1]:
pip install opencc

Collecting opencc
  Downloading OpenCC-1.1.9-cp311-cp311-manylinux2014_x86_64.whl.metadata (13 kB)
Downloading OpenCC-1.1.9-cp311-cp311-manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencc
Successfully installed opencc-1.1.9


In [None]:
import pandas as pd
import numpy as np
import re
import jieba
from google.colab import drive
# Mount Google Drive so the dataset CSVs are reachable under /content/drive.
drive.mount('/content/drive')

# Read the train / dev / test splits, in that order, into one frame each.
train_tcn, dev_tcn, test_tcn = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/datasets/train_tcn.csv',
        '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/datasets/dev_tcn.csv',
        '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/datasets/test_tcn.csv',
    )
)

In [None]:
# Distinct category labels in the training split (a missing label shows up as nan).
category_tcn = train_tcn['category'].unique()

# Print the labels, then how many there are, one per line.
print(category_tcn, len(category_tcn), sep='\n')

['Health & Beauty' 'Game Kingdom' 'Life & Entertainment' 'Mother & Baby'
 "Men's Apparel" "Women's Apparel" 'Women Bags' 'Women Shoes'
 'Home & Living' 'Everything Else' 'Hardware & 3C'
 "Men's Bags& Accessories" 'Sports & Outdoors' 'Helpbuy' 'Men Shoes'
 'Books' 'Motors' 'Home Electronic' 'Food & Beverages' 'Mobile & Gadgets'
 'Women Accessories' 'Pets' 'Tickets & Services' nan]
24


In [None]:
# Column dtypes and non-null counts for the training split.
train_tcn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   product_title  499999 non-null  object
 1   category       499982 non-null  object
dtypes: object(2)
memory usage: 7.6+ MB


## Data Preprocessing

1. Punctuation removal
2. Non-Chinese character removal
3. Number removal
4. Emoji/special character removal
5. Tokenization

In [None]:
import opencc

# Build one OpenCC converter up front so it is reused for every title
# instead of being re-created per row.
converter = opencc.OpenCC('s2t')  # 's2t' = Simplified Chinese to Traditional Chinese

def convert_to_traditional(text):
    """Convert Simplified Chinese in ``text`` to Traditional Chinese.

    Relies on the module-level OpenCC ``converter``.

    Parameters
    ----------
    text : object
        A single cell value. Strings are converted; any other value (for
        example the NaN pandas uses for a missing title) is returned as-is.

    Returns
    -------
    object
        The converted string, or ``text`` itself when it is not a string or
        when the conversion fails.
    """
    # Pass non-string cells through explicitly rather than relying on the
    # converter raising for them.
    if not isinstance(text, str):
        return text
    try:
        return converter.convert(text)
    except Exception:
        # Best effort: keep the original title rather than abort the whole
        # column. Unlike a bare ``except``, this lets KeyboardInterrupt and
        # SystemExit propagate, so a long ``apply`` can still be interrupted.
        return text

# Normalize the title column of every split to Traditional Chinese,
# overwriting the column with the converted text.
for frame, title_col in (
    (train_tcn, 'product_title'),
    (dev_tcn, 'text'),
    (test_tcn, 'text'),
):
    frame[title_col] = frame[title_col].apply(convert_to_traditional)

In [None]:
# Matches any single character outside the CJK ideograph range U+4E00-U+9FA5:
# punctuation, Latin letters, digits, emoji, whitespace and other symbols.
# Compiled once here instead of on every call.
_NON_CHINESE_RE = re.compile(r'[^\u4e00-\u9fa5]')


def non_chinese_removal(line):
    """Replace every non-Chinese character in ``line`` with a single space.

    The input is passed through ``str()`` first, so non-string values are
    processed via their string form (e.g. NaN becomes ``'nan'`` and therefore
    three spaces).

    A separate punctuation pattern used to run before this one, but it was
    written as a literal sequence rather than a character class. It almost
    never matched, and when it did it collapsed a whole run of punctuation
    into one space instead of one space per character. The single character
    class below already covers all punctuation.

    Parameters
    ----------
    line : object
        Title to clean; converted with ``str()``.

    Returns
    -------
    str
        A string of the same length as ``str(line)`` in which every character
        outside U+4E00-U+9FA5 has been replaced by a space.
    """
    return _NON_CHINESE_RE.sub(' ', str(line))

In [None]:
# Clean the title column of each split into a new 'product_title_clean' column.
for frame, source_col in (
    (train_tcn, 'product_title'),
    (dev_tcn, 'text'),
    (test_tcn, 'text'),
):
    frame['product_title_clean'] = frame[source_col].apply(non_chinese_removal)

In [None]:
# Spot-check the cleaned title at row label 10.
train_tcn['product_title_clean'][10]

'居家大推款記憶棉枕護勁頸椎枕修復脊椎治頸椎專用枕頭女孕婦失眠保健理療枕'

In [None]:
def tokenize_chinese(title):
    """Segment ``title`` with jieba and return the tokens as one space-separated string.

    ``jieba.cut`` is called with its default arguments. Of jieba's three
    tokenization modes, precision mode was found to work best for these
    titles after testing.
    """
    return " ".join(jieba.cut(title))

In [None]:
# Tokenize the cleaned titles of each split.
# Note: the training split stores its tokens under 'product_title_tokenized',
# while dev and test use 'product_title_processed'.
for frame, token_col in (
    (train_tcn, 'product_title_tokenized'),
    (dev_tcn, 'product_title_processed'),
    (test_tcn, 'product_title_processed'),
):
    frame[token_col] = frame['product_title_clean'].apply(tokenize_chinese)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.902 seconds.
DEBUG:jieba:Loading model cost 0.902 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [None]:
# Spot-check the tokenized title at row label 10.
train_tcn['product_title_tokenized'][10]

'居家 大 推款 記憶 棉枕護 勁 頸 椎 枕修 復 脊椎 治頸椎 專用 枕頭 女孕婦 失眠 保健 理療 枕'

In [None]:
# Display the tokenized dev titles (pandas abbreviates the middle rows of a long Series).
dev_tcn['product_title_processed']

Unnamed: 0,product_title_processed
0,手機殼 軟殼 掛繩 殼 大眼 兔 硅 膠殼
1,鍍膜車 蠟 強力 撥水型
2,低糖 芒果乾 臻 御行
3,小徑 文化 日本 進口
4,凱蒂貓 涼鞋 童鞋 白紅色 小童
...,...
995,撕破 樂趣 獨特風格 中 直筒 牛仔 褲
996,兒童 套裝 臺 灣 製 薄 長 袖 居家 套裝 魔法
997,高密 尼龍 後 揹 包 中型
998,開口 可 調節 戒指


In [None]:
# First five tokenized test titles.
test_tcn['product_title_processed'][:5]

Unnamed: 0,product_title_processed
0,美麗諾 羊毛 保暖 襪淺 灰
1,甜蜜 水晶 天然 水晶 五行 珠手 鍊 手 鍊
2,粉晶 六角 柱純 銀項 鍊
3,超強力 雙面 膠戶 外 專用
4,燈 專屬 優惠盒


In [None]:
# Write each processed split to the 'preprocessed' folder on Drive, without the index column.
for frame, csv_path in (
    (train_tcn, '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/preprocessed/cleaned_train_tcn.csv'),
    (dev_tcn, '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/preprocessed/cleaned_dev_tcn.csv'),
    (test_tcn, '/content/drive/My Drive/Shopee-Product-Title-Translation_ZH-EN/preprocessed/cleaned_test_tcn.csv'),
):
    frame.to_csv(csv_path, index=False)

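### Tasks where stopword removal should be avoided

Stopwords carry grammatical and contextual meaning that the following tasks depend on, so this pipeline does not remove them:

1. Machine translation

2. Language modeling

3. Text summarization

4. Question answering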