In [1]:
import pandas as pd
import matplotlib.pylab as plt
import re
from sklearn.feature_extraction import text
from spellchecker import SpellChecker
from nltk.corpus import stopwords

## Data preprocess

In [2]:
# Read csv
hscode_df = pd.read_csv("./data/fta_hscode.csv", encoding="utf-8",
                       dtype={'HS_CODE': str, 'ITEM_NAME': str})
hscode_df.HS_CODE = hscode_df.HS_CODE.map(lambda x: '0' + x if len(x) < 10 else x)
hscode_df.ITEM_NAME = hscode_df.ITEM_NAME.str.lower()

In [3]:
# Slice code
def code_slice(df, column, n):
    print("Max Length of Original Code: ", hscode_df[column].map(len).max())
    print("Min Length of Original Code: ", hscode_df[column].map(len).min())
    df.HS_CODE = hscode_df[column].str.slice(stop=n)
    print("Sliced Length of Code: ", n)
    return df

In [4]:
def delete_blank_rows(df, column):
    origin_length = len(df)
    print("Number of Rows in Dataframe (Before): ", origin_length)
    df.dropna(subset=[column], inplace=True)
    df[column] = df[column].str.strip()
    df = df[df[column].map(lambda x: x!='')]
    print("Number of Deleted Rows in Dataframe: ", origin_length - len(df))
    print("Number of Rows in Dataframe (After): ", len(df))
    return df

In [5]:
def process_special_single_char_digital(df, column):
    df[column] = df[column].map(lambda x: ' '.join(re.findall(r"[a-z]{2,}", x)))
    return df

In [48]:
def delete_stop_words(df, column):
    en_stops = set(stopwords.words('english'))
    df[column] = df[column]\
        .map(lambda x: ' '.join([word for word in x.split() if not word in en_stops]))
    return df

In [49]:
def check_spell(df, column):
    spell = SpellChecker()
    df[column] = df[column]\
        .map(lambda x: ' '.join([spell.correction(word) for word in x.split()]))
    return df

In [71]:
spellchecked_df = check_spell(processed_df[:1000], 'ITEM_NAME')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [73]:
spellchecked_df

Unnamed: 0,HS_CODE,ITEM_NAME
0,8419909090,paper machine parts
1,8419909090,paraffin dissolved parts
2,8419909090,para shimmer
3,8419909090,pars for constant temperature and humidity cha...
4,8419909090,part
5,8419909090,part air preheated
6,8419909090,part cleaner
7,8419909090,part cleaner and parts
8,8419909090,part coffee roasted
9,8419909090,part for


In [None]:
hscode_df.loc[hscode_df.ITEM_NAME.str.contains()]

In [6]:
hscode_df = delete_blank_rows(hscode_df, 'ITEM_NAME')

Number of Rows in Dataframe (Before):  2074940
Number of Deleted Rows in Dataframe:  3360
Number of Rows in Dataframe (After):  2071580


In [7]:
# Check Special Char
hscode_df.loc[hscode_df.ITEM_NAME.str.contains("[^a-z ]")].head()

Unnamed: 0,HS_CODE,ITEM_NAME
5,8419909090,part(air preheater)
10,8419909090,part for 7000 pph modular ccr regen section
20,202201000,frozen beef cow short rib 3-7 iw
21,202201000,frozen beef c-short rib 7-11
22,202201000,frozen beef c shortribs 2-6r iw


In [8]:
processed_df = process_special_single_char_digital(hscode_df, "ITEM_NAME")

In [9]:
processed_df.iloc[[5, 10, 20, 21, 22]]

Unnamed: 0,HS_CODE,ITEM_NAME
5,8419909090,part air preheater
10,8419909090,part for pph modular ccr regen section
20,202201000,frozen beef cow short rib iw
21,202201000,frozen beef short rib
22,202201000,frozen beef shortribs iw


In [42]:
processed_df

Unnamed: 0,HS_CODE,ITEM_NAME
0,8419909090,paper machine parts
1,8419909090,paraffin dissolver parts
2,8419909090,para thimmer
3,8419909090,pars for constant temperature and humidity cha...
4,8419909090,part
5,8419909090,part air preheater
6,8419909090,part cleaner
7,8419909090,part cleaner and parts
8,8419909090,part coffee roaster
9,8419909090,part for


In [7]:
re.sub(r"[^a-z0-9 ]+", " ", "rubbing？strip,？rubbing？vrs2833")

'rubbing strip rubbing vrs2833'

In [32]:
hscode_df = delete_blank_rows(hscode_df, 'ITEM_NAME')
hscode_df.head()

Number of Rows in Dataframe (Before):  2071580
Number of Deleted Rows in Dataframe:  0
Number of Rows in Dataframe (After):  2071580


Unnamed: 0,HS_CODE,ITEM_NAME
0,8419909090,PAPER MACHINE PARTS
1,8419909090,PARAFFIN DISSOLVER PARTS
2,8419909090,PARA THIMMER
3,8419909090,PARS FOR CONSTANT TEMPERATURE AND HUMIDITY CHA...
4,8419909090,PART


In [None]:
hscode_df.loc[hscode_df.ITEM_NAME == 'WELL'].drop_duplicates('HS_CODE')

In [7]:
# Delete stopwords in ITEM_NAME
stop_words = [word.upper() for word in text.ENGLISH_STOP_WORDS]
hscode_df['ITEM_NAME_WITHOUT_STOPWORDS'] = hscode_df.ITEM_NAME.map(lambda x: ' '.join(word for word in str(x).upper().split() if word not in (stop_words)))
hscode_df.head()

Unnamed: 0,HS_CODE,ITEM_NAME,ITEM_NAME_WITHOUT_STOPWORDS
0,8419909090,PAPER MACHINE PARTS,PAPER MACHINE PARTS
1,8419909090,PARAFFIN DISSOLVER PARTS,PARAFFIN DISSOLVER PARTS
2,8419909090,PARA THIMMER,PARA THIMMER
3,8419909090,PARS FOR CONSTANT TEMPERATURE AND HUMIDITY CHA...,PARS CONSTANT TEMPERATURE HUMIDITY CHAMBER
4,8419909090,PART,


In [None]:
# Hscode count
hscode_count = hscode_df.groupby('HS_CODE').agg('count').rename(columns={'ITEM_NAME':'COUNT'}).sort_values('COUNT', ascending=False)
hscode_count.head()

In [None]:
hscode_count.sort_values('COUNT').plot.box()

In [None]:
hscode_df.loc[hscode_df.HS_CODE == '392690'].drop_duplicates('ITEM_NAME')

In [None]:
name_count = hscode_df.groupby('ITEM_NAME').agg('count').rename(columns={'HS_CODE':'COUNT'}).sort_values('COUNT', ascending=False)
name_count.head()

In [None]:
len(name_count)

In [None]:
name_count.plot.box()

In [None]:
name_count.describe()

In [None]:
hscode_df.groupby('HS_CODE').agg('count').plot.box()

In [None]:
hscode_df.groupby('HS_CODE').agg('count').sort_values('ITEM_NAME', ascending=False)

In [None]:
hscode_df.groupby('HS_CODE').agg('count').describe()

In [None]:
df3 = hscode_df.groupby('HS_CODE').agg('count')

In [None]:
iqr = df3.quantile(0.75) - df3.quantile(0.25)
upper_bound = df3.quantile(0.75) + 1.5*iqr
lower_bound = df3.quantile(0.25) - 1.5*iqr
print(lower_bound)
df3 = df3[(df3 < upper_bound) & (df3 > lower_bound) ]

In [None]:
df3.plot.box()

In [None]:
df3.describe()

In [None]:
hscode_df.groupby('HS_CODE').agg('count').sort_values('ITEM_NAME', ascending=False)[1100:].plot.box()

In [None]:
name_count.plot()

In [None]:
name_count[3:750000].plot()

In [None]:
len(name_count)

In [None]:
name_count.iloc[100000]

In [None]:
hscode_df.loc[hscode_df.ITEM_NAME == 'BUTEETRAPPECARBURANT']

In [None]:
name_code_group = hscode_df.groupby(['ITEM_NAME', 'HS_CODE']).agg('count').reset_index()
name_code_group.head()

In [None]:
name_code_group[name_code_group.ITEM_NAME.duplicated(keep=False)]

In [None]:
name_code_group[name_code_group.ITEM_NAME.duplicated(keep=False)].drop_duplicates('ITEM_NAME')

In [None]:
name_code_group[name_code_group.ITEM_NAME.duplicated(keep=False)].iloc[582422:-13].drop_duplicates('ITEM_NAME')

In [None]:
name_code_group[~name_code_group.ITEM_NAME.duplicated(keep=False)]

In [None]:
hscode_df.ITEM_NAME.str.upper().str.replace(r"[^A-Z ]", "", regex=True)

In [None]:
re_hscode_df = hscode_df
re_hscode_df.ITEM_NAME = re_hscode_df.ITEM_NAME.str.upper().str.replace(r"[^A-Z ]", "", regex=True)
re_hscode_df.head()

In [None]:
re_hscode_df.dropna(inplace=True)
re_hscode_df.loc[re_hscode_df.ITEM_NAME.isna()]

In [None]:
re_hscode_df.ITEM_NAME = re_hscode_df.ITEM_NAME.str.strip()

In [None]:
re_hscode_df.sort_values('ITEM_NAME').head()

In [None]:
re_hscode_df.drop(re_hscode_df[re_hscode_df.ITEM_NAME == '   '].index, inplace=True)
re_hscode_df.loc[re_hscode_df.ITEM_NAME == '']

In [None]:
re_hscode_df.groupby('HS_CODE').agg('count').sort_values('ITEM_NAME').plot()

In [None]:
re_hscode_df.groupby('HS_CODE').agg('count').sort_values('ITEM_NAME', ascending=False).head()

In [None]:
re_hscode_df.groupby(['HS_CODE', 'ITEM_NAME']).agg('count').reset_index()

In [None]:
re_hscode_df_name_count = re_hscode_df.groupby('ITEM_NAME').agg('count')\
                        .rename(columns={'HS_CODE': 'COUNT'}).reset_index()\
                        .sort_values('COUNT', ascending=False)
re_hscode_df_name_count.head()

In [None]:
re_hscode_df_name_count[1:100].plot(x='ITEM_NAME', y="COUNT")

In [None]:
# need to fix
re_hscode_df.groupby(['HS_CODE', 'ITEM_NAME']).agg('count').reset_index()

In [None]:
re_hscode_df.drop_duplicates(inplace=True)
re_hscode_df.head()

In [None]:
# Check above
re_hscode_df.loc[re_hscode_df.ITEM_NAME == 'PART'].sort_values('HS_CODE').head()

In [None]:
name_count[~name_count.index.str.upper().str.contains(r"[^A-Z ]")]

In [None]:
len(name_count)

In [None]:
test = hscode_df.ITEM_NAME.str.upper().str.contains(r"[^A-Z ]")

In [None]:
test.loc[(test != False) & (test != True)]

In [None]:
off_special_char = hscode_df[~test.fillna(True)]
off_special_char.head()

In [None]:
off_special_char.HS_CODE = off_special_char.HS_CODE.str.slice(stop=6)
off_special_char.head()

In [None]:
off_special_char.ITEM_NAME = off_special_char.ITEM_NAME.str.upper()

In [None]:
off_special_char.sort_values('ITEM_NAME')

### Make train, test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
train, test = train_test_split(re_hscode_df, test_size=0.2)

In [None]:
train.ITEM_NAME.to_csv('train_name_fta_spcial_char_off.txt', index=False, header=False)

In [None]:
train.HS_CODE.to_csv('train_class_fta_spcial_char_off.txt', index=False, header=False)

In [None]:
test.ITEM_NAME.to_csv('test_name_fta_spcial_char_off.txt', index=False, header=False)

In [None]:
test.HS_CODE.to_csv('test_class_fta_spcial_char_off.txt', index=False, header=False)

In [None]:
train.sort_values('ITEM_NAME')

In [None]:
train, test = train_test_split(re_hscode_df, test_size=0.2)

In [None]:
train.ITEM_NAME.to_csv('train_name_fta.txt', index=False, header=False)
train.HS_CODE.to_csv('train_class_fta.txt', index=False, header=False)
test.ITEM_NAME.to_csv('test_name_fta.txt', index=False, header=False)
test.HS_CODE.to_csv('test_class_fta.txt', index=False, header=False)

In [None]:
train.ITEM_NAME.to_csv('train_name_fta_alpha_word.txt', index=False, header=False)
train.HS_CODE.to_csv('train_class_fta_alpha_word.txt', index=False, header=False)
test.ITEM_NAME.to_csv('test_name_fta_alpha_word.txt', index=False, header=False)
test.HS_CODE.to_csv('test_class_fta_alpha_word.txt', index=False, header=False)

## Check Stop Words

52036              hi name beatrice bunny
60906     mom chef autumn squash porridge
156711                       excepearl ce
Name: ITEM_NAME, dtype: object

In [27]:
en_stops

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [29]:
processed_df.loc[processed_df.ITEM_NAME.str.contains(r' she ')]

Unnamed: 0,HS_CODE,ITEM_NAME
241031,3303001000,ead she edp ml spray np
481836,4202121090,philips she st germain underground citiscape
606010,8211920000,sheffield she weapon
716405,8419509000,used she mk right defect
915877,8481201000,mt she hyundai
1200064,6211491000,cloth nobis she ra black
1294290,7210499010,flat steel she toyota
1435831,7613002000,superinsulated she dewar
1600643,8517626060,philips she earphone
1714738,9307000000,sheffield she weapon


## Make Unknown Words Dictionary

In [96]:
a = [i.split() for i in test_df.ITEM_NAME]
# b = [j.split() for j in a]

In [30]:
spell = SpellChecker()

In [45]:
spell.correction("runnin")

'running'