In [2]:
!conda install py_ nltk -q

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [4]:
# Standard Libraries
import pandas as pd
import numpy as np
import json

# Data Preprocessing & NLP
import nltk
import re
import string
import gensim
from textblob import Word

import xgboost as xgb
from xgboost import XGBClassifier

from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, BaggingRegressor, GradientBoostingClassifier,BaggingClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics.pairwise import linear_kernel

# Performance metrics
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics.pairwise import cosine_similarity

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
sns.set()
%matplotlib inline

pd.set_option('display.max_rows', 20)

import warnings
warnings.filterwarnings('ignore')

import datetime
today = datetime.date.today().strftime("%d-%m-%Y")
print(today)

17-02-2021


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Yo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
paths = 'data/hs_code.xlsx'
def get_master(sheets):
    data = pd.read_excel(paths,sheet_name= sheets)
    #data['section'] = data['section'].map('{:02}'.format)
    data = data[['section','description']]
    data['description'] = data['description'].str.lower()
 #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})
    data.columns = ['category', 'text']
    
    return data

In [6]:
sheets = '8_digit'
hs_8 = get_master(sheets)
sheets = '6_digit'
hs_6 = get_master(sheets)
sheets = '4_digit'
hs_4 = get_master(sheets)
sheets = '2_digit'
hs_2 = get_master(sheets)
#sheets = 'imp'
#imp = get_master(sheets)
#sheets = 'exp'
#exp = get_master(sheets)
df = pd.concat([hs_8,hs_6,hs_4,hs_2], ignore_index=True)
df

XLRDError: Excel xlsx file; not supported

In [65]:
hs_df = df.drop_duplicates()
hs_df

Unnamed: 0,category,text
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."
...,...,...
115925,18,sleeping tester
115926,17,gem car vin:52cg2sga0d0003886
115927,15,sbf pin
115928,16,sbf sheave


In [66]:
hs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113370 entries, 0 to 115929
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  113370 non-null  int64 
 1   text      113369 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.6+ MB


In [67]:
hs_df['category'].value_counts()

15    46124
16    29899
7     13262
18     5285
6      3903
      ...  
8       291
3       208
14      149
19       49
21       19
Name: category, Length: 21, dtype: int64

In [55]:
# Associate Category names with numerical index and save it in new column category_id
# Associate Category names with numerical index and save it in new column category_id
hs_df['category_id'] = hs_df['category'].factorize()[0]
hs_df

Unnamed: 0,category,text,category_id
0,01,"horses; live, purebred breeding animals - pure...",0
1,01,"horses; live, other than purebred breeding ani...",0
2,01,asses; live - other,0
3,01,mules and hinnies; live- other,0
4,01,"cattle; live, purebred breeding animals - pure...",0
...,...,...,...
115925,18,sleeping tester,17
115926,17,gem car vin:52cg2sga0d0003886,16
115927,15,sbf pin,14
115928,16,sbf sheave,15


In [56]:
# Create a new pandas dataframe "category_id_df", which only has unique Categories, also sorting this list in order of category_id values
category_id_df = hs_df[['category', 'category_id']].drop_duplicates().sort_values('category_id')
category_id_df

Unnamed: 0,category,category_id
0,01,0
571,02,1
1057,03,2
1230,04,3
1700,05,4
...,...,...
9100,17,16
9914,18,17
10240,19,18
10270,20,19


In [57]:
# Create a dictionary ( python datastructure - like a lookup table) that 
# can easily convert category names into category_ids and vice-versa
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)

id_to_category

{0: '01',
 1: '02',
 2: '03',
 3: '04',
 4: '05',
 5: '06',
 6: '07',
 7: '08',
 8: '09',
 9: '10',
 10: '11',
 11: '12',
 12: '13',
 13: '14',
 14: '15',
 15: '16',
 16: '17',
 17: '18',
 18: '19',
 19: '20',
 20: '21'}

In [58]:
category_id_df

Unnamed: 0,category,category_id
0,01,0
571,02,1
1057,03,2
1230,04,3
1700,05,4
...,...,...
9100,17,16
9914,18,17
10240,19,18
10270,20,19


In [68]:
#Plot the distribution of news articles by category
hs_df.groupby('category').category_id.count().plot.bar(ylim=0,figsize=(15,8))

AttributeError: 'DataFrameGroupBy' object has no attribute 'category_id'

In [60]:
# Drop duplicate data
hs_df.drop_duplicates(subset=['category', 'text'], inplace=True)

In [61]:
# Data Cleaning
def clean_text(text):
    # remove everything except alphabets
    text = re.sub("[^a-zA-Z]", " ", text)
    # remove whitespaces
    text = ' '.join(text.split())
    text = text.lower()
    
    return text

In [62]:
# creating clean text feature
hs_df['clean_text'] = hs_df['text'].apply(clean_text).str.replace('bn bn ', '')

# creating clean text feature
hs_df['clean_text'] = hs_df['text'].apply(clean_text).str.replace(' bn ', '')

TypeError: expected string or bytes-like object

In [44]:
def random_color_func(word=None, font_size=None, position=None,
                      orientation=None, font_path=None, random_state=None):
    h = int(360.0 * 55.0 / 255.0)
    s = int(100.0 * 255.0 / 255.0)
    l = int(100.0 * float(random_state.randint(70, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(h, s, l)

def freq_words(x, terms = 30):
    all_words = ' '.join([text for text in x])
    all_words = all_words.split()
    
    freq_dist = nltk.FreqDist(all_words)
    words_df = pd.DataFrame({'word':list(freq_dist.keys()), 'count':list(freq_dist.values())})
    
    fig = plt.figure(figsize=(21,16))
    ax1 = fig.add_subplot(2,1,1)
    wordcloud = WordCloud(width=1000, height=300, background_color='black', 
                          max_words=1628, relative_scaling=1,
                          color_func = random_color_func,
                          normalize_plurals=False).generate_from_frequencies(freq_dist)
    
    ax1.imshow(wordcloud, interpolation="bilinear")
    ax1.axis('off')
    
    # select top 20 most frequent word
    ax2 = fig.add_subplot(2,1,2)
    d = words_df.nlargest(columns="count", n = terms) 
    ax2 = sns.barplot(data=d, palette = sns.color_palette('BuGn_r'), x= "count", y = "word")
    ax2.set(ylabel= 'Word')
    plt.show()

In [45]:
# plot 25 most frequent words including stop words
freq_words(hs_df['clean_text'], 25)

KeyError: 'clean_text'

In [2]:
types = "section"
#types = "chapter"

print("Load the dataset: Section")
t0 = time()

sheets = '8_digit'
eights = get_master(sheets,types)
sheets = '6_digit'
sixs = get_master(sheets,types)
sheets = '4_digit'
fours = get_master(sheets,types)
sheets = '2_digit'
twos = get_master(sheets,types)

sheets = 'imp'
tests = get_master(sheets,types)

sheets = 'exp'
decl = get_master(sheets,types)

data = pd.concat([eights,sixs,fours,twos,tests,decl], ignore_index=True)
master = manage_data(data)


#twenty_test = manage_data(tests)
load_time = time() - t0
print("Load dataset time:  %0.3fs" % load_time)
print(len(master))
master.head()

Load the dataset: Section
Load dataset time:  168.814s
179892


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [4]:
from collections import Counter
Counter(master["target"])

Counter({'01': 987,
         '02': 893,
         '03': 247,
         '04': 855,
         '05': 1572,
         '06': 6019,
         '07': 18813,
         '08': 451,
         '09': 507,
         '10': 1210,
         '11': 5143,
         '12': 795,
         '13': 1004,
         '14': 159,
         '15': 78696,
         '16': 45671,
         '17': 6595,
         '18': 8440,
         '19': 56,
         '20': 1755,
         '21': 24})

In [5]:
df = master.drop_duplicates()
df

Unnamed: 0,target,data
0,01,"horses; live, purebred breeding animals - pure..."
1,01,"horses; live, other than purebred breeding ani..."
2,01,asses; live - other
3,01,mules and hinnies; live- other
4,01,"cattle; live, purebred breeding animals - pure..."
...,...,...
179887,20,"wire brush, hand, with wood handle"
179888,10,wypall l20 perforated jumbo roll wiper 2 polls
179889,07,"yellow tape 2"" x 10 ytd"
179890,06,zinc it crc


## Save Dataframe

In [6]:
joblib.dump(df, 'data/df_section.pkl', compress=1)
print('success')

success


In [26]:
import re 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

def create_model(df,names):
    X = []
    for i in range(len(df)):
        X.append(clean_str(df.iloc[i][1]))
    
    y = np.array(df["target"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)



    print("pipeline of feature engineering and model")
    model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

    print("paramater selection")


    parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

    print("Training: ")
   
    t0 = time()
    gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
    gs_clf_svm = gs_clf_svm.fit(X, y)
    print(gs_clf_svm.best_score_)
    print(gs_clf_svm.best_params_)

    model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

#fit model with training data
    model.fit(X_train, y_train)

    pred = model.predict(X_test)

    print(accuracy_score(y_test, pred))

#save the model
    print("Save Model")

    joblib.dump(model, 'data/model/model_'+names +'.pkl', compress=1)
    print("Create Model.. "+names+" success")


    print (classification_report(y_test, pred))

In [27]:
import re
def clean_string(string,chars=None):
    """
    Cleans a ticker for easier use throughout MoneyTree

    Splits by space and only keeps first bit. Also removes
    any characters that are not letters. Returns as lowercase. / uppercase

    >>> clean_ticker('^VIX')
    'vix'
    >>> clean_ticker('SPX Index')
    'spx'
    """
    pattern = re.compile('[\W_+,\s]')
    #res = pattern.sub('', ticker.split(' ')[0])
   
    res = pattern.sub('', string)
    #print(res)
    if chars =='upper':
        return res.upper()
    else:
        return res.lower()
  



    def clean_strings(strings):
        """
        Maps clean_ticker over tickers. (แยก ticker ออกมาทีละ 1 digit)
        """
        return [clean_string(x) for x in strings]

In [28]:
def create_main_model( types):
    data = ""
    t0 = time()
    a = []
    i = 0
    paths = 'data/df_section.pkl'
    df = load_df(paths)
    #df = joblib.load('data/train/df_master_train.pkl')
    print("download data...") 
       
    df = df[[types, 'description']]   
        
       
    df.columns = ['target', 'data']
    n = df.isnull().sum()
    print("missing values : ", n )
    df.dropna(inplace=True)
    df['target'] = df['target'].apply(int)
    print("Load Data.. "+types+" success")
        
        #save the data train
    print("Save data train")
    create_model(df,types)
    
    load_time = time() - t0
    print("Load dataset time:  %0.3fs" % load_time)


Load DataFrame...
Load data success


In [23]:
data

Unnamed: 0,target,data
0,01,"horses; live, purebred breeding animals - pure..."
1,01,"horses; live, other than purebred breeding ani..."
2,01,asses; live - other
3,01,mules and hinnies; live- other
4,01,"cattle; live, purebred breeding animals - pure..."
...,...,...
179887,20,"wire brush, hand, with wood handle"
179888,10,wypall l20 perforated jumbo roll wiper 2 polls
179889,07,"yellow tape 2"" x 10 ytd"
179890,06,zinc it crc


In [24]:
import re 
def clean_str(string):
    """
    Tokenization/string cleaning for dataset
    Every dataset is lower cased except
    """
    string = re.sub(r"\n", "", string)    
    string = re.sub(r"\r", "", string) 
    string = re.sub(r"[0-9]", "digit", string)
    string = re.sub(r"\'", "", string)    
    string = re.sub(r"\"", "", string)    
    return string.strip().lower()

In [25]:
%%time

print("train test split dataset")
#train test split
df = data.copy()
from sklearn.model_selection import train_test_split
X = []
for i in range(df.shape[0]):
    X.append(clean_str(df.iloc[i][1]))
y = np.array(df["target"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

train test split dataset


TypeError: expected string or bytes-like object

In [12]:
#feature engineering and model selection
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


print("Training: ")

#pipeline of feature engineering and model
t0 = time()
model = Pipeline([('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

print("paramater selection")
#paramater selection
from sklearn.model_selection import GridSearchCV
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}

gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X, y)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

#preparing the final pipeline using the selected parameters
print("preparing the final pipeline using the selected parameters")
model = Pipeline([('vectorizer', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', OneVsRestClassifier(LinearSVC(class_weight="balanced")))])

#fit model with training data
print("fit model with training data")
model.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

Training: 
paramater selection


NameError: name 'y' is not defined

In [None]:
#evaluation on test data
t0 = time()
pred = model.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(pred, y_test)

In [None]:
accuracy_score(y_test, pred)

In [None]:
from sklearn.metrics import classification_report
print (classification_report(y_test, pred))

In [None]:
#save the model
print("Save Model")
import joblib
joblib.dump(model, 'model/model_section.pkl', compress=1)