In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read csv
df = pd.read_csv("./data/test_raw_data.csv", dtype={"name": str, "code": str}, encoding="utf-8")
df

Unnamed: 0,name,code
0,tools of wood,4417
1,broom of wood,4417
2,brush bodies of wood,4417
3,handles of wood,4417
4,tool bodies of wood,4417
5,tool handles of wood,4417
6,hat-forms of felt,6501
7,hat bodies of felt,6501
8,hoods of felt,6501
9,manchons of felt,6501


In [3]:
# Select code
df2 = df.loc[(df["code"].str.len() > 5)].reset_index(drop=True)
df2

Unnamed: 0,name,code
0,beret,65010000
1,cap stuffing,65010000
2,cavalier's hats,65010000
3,felt hat body,65010000
4,felt hats preform,65010000
5,rabbit down felt semi-finished cap,65010000
6,wool felt hat felt,65010000
7,wool felt semi-finished cap,65010000
8,wool fflt hat body,65010000
9,aluminium alloy outline border hollow glass,70052100


In [4]:
# Update slice code
df2.code = df2.code.str.slice(stop=6)
df2

Unnamed: 0,name,code
0,beret,650100
1,cap stuffing,650100
2,cavalier's hats,650100
3,felt hat body,650100
4,felt hats preform,650100
5,rabbit down felt semi-finished cap,650100
6,wool felt hat felt,650100
7,wool felt semi-finished cap,650100
8,wool fflt hat body,650100
9,aluminium alloy outline border hollow glass,700521


In [5]:
# Replace name
df2.name = df2.name.str.replace(r"[.]{3}|[^a-zA-Z\- ]+", " ").str.replace(r"[\.\-]|\b([a-z]{1})\b", "", regex=True)\
                        .str.replace(r"[ ]{2,}", " ").str.strip()
df2

Unnamed: 0,name,code
0,beret,650100
1,cap stuffing,650100
2,cavalier hats,650100
3,felt hat body,650100
4,felt hats preform,650100
5,rabbit down felt semifinished cap,650100
6,wool felt hat felt,650100
7,wool felt semifinished cap,650100
8,wool fflt hat body,650100
9,aluminium alloy outline border hollow glass,700521


In [6]:
# Drop white space name
df2 = df2[(df2.name != '') & (df2.name != ' ')].reset_index(drop=True)
df2

Unnamed: 0,name,code
0,beret,650100
1,cap stuffing,650100
2,cavalier hats,650100
3,felt hat body,650100
4,felt hats preform,650100
5,rabbit down felt semifinished cap,650100
6,wool felt hat felt,650100
7,wool felt semifinished cap,650100
8,wool fflt hat body,650100
9,aluminium alloy outline border hollow glass,700521


In [7]:
# Count duplicated data about name with code
df2 = df2.groupby(["name", "code"]).agg('size').reset_index(name="counts")
# df2 = df2.groupby(["name", "code"]).size().reset_index(name="counts")
df2

Unnamed: 0,name,code,counts
0,aa alkaline battery,850610,10
1,aa brace,730890,16
2,aa bracket,730890,14
3,aa fresh royal jelly,410004,13
4,aa superenergy adhesive,350691,9
5,aaa alkaline battery,850610,11
6,aac,761490,47
7,aac,854442,12
8,aac,854449,55
9,aaluminium side window for vehicle,870829,16


In [8]:
# Extract max count row
df2 = df2.iloc[df2.groupby("name")["counts"].idxmax()]
df2

Unnamed: 0,name,code,counts
0,aa alkaline battery,850610,10
1,aa brace,730890,16
2,aa bracket,730890,14
3,aa fresh royal jelly,410004,13
4,aa superenergy adhesive,350691,9
5,aaa alkaline battery,850610,11
8,aac,854449,55
9,aaluminium side window for vehicle,870829,16
10,aanimal agentia,300490,4
11,ab adhesive,350691,37


In [9]:
# Seperate and make train, test dataframes
from sklearn.model_selection import train_test_split
train, test = train_test_split(df2, test_size=0.2)

In [10]:
train

Unnamed: 0,name,code,counts
94252,cotton nitrilon yarn,540239,11
353675,sheet pulp machine,844140,7
242208,magic spotted granite,680293,12
299469,plastic barricades,392690,4
396192,telecommunication fittings,851770,115
196192,highefficiency energysaving silicon carbide ba...,690390,24
40468,biochemical tube,701790,19
35216,bath ball,392111,122
50631,bulrush curtain,460129,65
166961,fullclosed maintenancefree battery case,850790,13


In [11]:
test

Unnamed: 0,name,code,counts
205928,hypersensitivity metal detector,854370,3
331900,red rosewood whole set cabinet,940360,11
40229,bimetallic thermometer,902519,32
352014,settop box sharer,847141,36
412669,twolamp uv drier,841939,14
315613,precision smelting tool,846610,44
219776,jacquard maillot without sash,610990,101
203085,household quick sharpener,850980,6
101561,dairy food film,392099,39
2067,acidity deoiling agent,340290,7


In [15]:
# Extract dataframe to csv
train.to_csv("./data/train_file.txt", sep='\t', encoding='utf-8', header=False, index=False, columns=["name", "code"])
test.to_csv("./data/test_file.txt", sep='\t', encoding='utf-8', header=False, index=False, columns=["name", "code"])

## Saving

In [2]:
# Delete one column and save
'''
df = pd.read_csv("./data/test_raw_data.csv")
df = df.drop(["Unnamed: 0"], axis=1)
df.to_csv("./data/test_raw_data.csv", index=False)
'''

'\ndf = pd.read_csv("./data/test_raw_data.csv")\ndf = df.drop(["Unnamed: 0"], axis=1)\ndf.to_csv("./data/test_raw_data.csv", index=False)\n'

In [12]:
# For test
# df2.loc[df2["name"].str.contains("dioxyanth", na=False)]

Unnamed: 0,name,code
349314,1-2-dioxyanthraquinone,291469
1365317,dioxyanthraquinone,294200
2085845,dioxyanthraquinone,294200
3921937,1-2-dioxyanthraquinone,291469
6721377,1-2-dioxyanthraquinone,291469
7473146,dioxyanthraquinone,294200
8635622,1-2-dioxyanthraquinone,291469


In [8]:
# PREP lambda
df2 = df2.groupby(["name", "code"]).agg({"counts": "sum"})
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
name,code,Unnamed: 2_level_1
aa alkaline battery,850610,10
aa brace,730890,16
aa bracket,730890,14
aa fresh royal jelly,410004,13
aa superenergy adhesive,350691,9
aaa alkaline battery,850610,11
aac,761490,47
aac,854442,12
aac,854449,55
aaluminium side window for vehicle,870829,16


In [9]:
# Calcurate code percentage in same name
df2 = df2.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))

In [15]:
df2.index.name

In [9]:
# Show same name with different code
df2[df2.duplicated(["name"], keep=False)]

Unnamed: 0,name,code,counts
6,aac,761490,47
7,aac,854442,12
8,aac,854449,55
17,ab glue,350691,132
18,ab glue,350699,34
45,abamectin,293399,23
46,abamectin,380891,20
52,abamectin powder,300420,49
53,abamectin powder,300490,12
100,abnormal elbow pipe,391740,6


In [44]:
# Search contain specific string
df2[df2.name.str.contains("[^a-z ()-0-9#+]", na=False, regex=True)]

Unnamed: 0,name,code,counts
0,#422 maleic modified resin,390750,7
1,%long persistence luminous powder,320650,48
2,%steel angled bar,722240,15
3,'sun' photographic lamp,940540,6
6,(+ or -)-1-c2-(amido)-1-(4-anisole group) ethy...,294200,1
7,(2-chlormoethyl cyanide) aether,290949,21
8,(380cs coil spring) modified anti-explosion pl...,870893,3
18,"(r)- (+)-1,1'-bi-2-naphthol",294200,12
20,"(r)-6,7-dihydroxyl geraniol benzene ammonia me...",294200,11
23,(s)4-bromo-3-hydroxy butanoate,294200,3


In [58]:
# Search contain specific string
df2[df2.name.str.contains("[.]{3}|[#]+", na=False, regex=True)]

Unnamed: 0,name,code,counts
0,#422 maleic modified resin,390750,7
50,0.25 ....25 grade vernier gauge,902620,1
52,0.9% sodium ...z)-2-hydroxyimino-2-(2-aminothi...,300490,2
55,000# lubricating grease,271019,18
135,"1,4-dioxaspiro[4.5...[1, 2-a]-pyrimidin-4-one",294200,9
136,"1,4-dioxaspiro[4.5]...-4-one",294200,1
137,"1,4-dioxaspiro[4.5]...-pyrido[1,2-a]pyrimidin-...",294200,1
216,1-st.2-nd sleeve and ...tin bushing,848390,1
218,1-st.2-nd sleeve and hub... flywheel case,848390,1
219,1-st.2-nd sleeve and hub...foreign trade shaft,848390,1


In [26]:
# Select rows by index
df2.iloc[2:10]

Unnamed: 0,name,code,counts
2,%steel angled bar,722240,15
3,'sun' photographic lamp,940540,6
4,( massage ) antiskid mat for bath use,401691,30
5,( tower ) crane,842620,5
6,(+ or -)-1-c2-(amido)-1-(4-anisole group) ethy...,294200,1
7,(2-chlormoethyl cyanide) aether,290949,21
8,(380cs coil spring) modified anti-explosion pl...,870893,3
9,(bops) plastic positive air-pressure thermal f...,847740,1


In [55]:
df2.name.str.replace("[^a-zA-Z ]", " ")

0                                     maleic modified resin
1                          long persistence luminous powder
2                                          steel angled bar
3                                    sun  photographic lamp
4                       massage   antiskid mat for bath use
5                                             tower   crane
6            or      c   amido       anisole group  ethy...
7                              chlormoethyl cyanide  aether
8             cs coil spring  modified anti explosion pl...
9          bops  plastic positive air pressure thermal f...
10               export type  fetus  multiparameter monitor
11            high resistant  magnetic stripe reader writer
12                    industrial grade  sodium fluosilicate
13                          industrial grade  sulfuric acid
14         low temperature  steady state damp heat test box
15                                           p  anisic acid
16         phase compunded self excitati