In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizer import porter_stem_tokenizer
from tokenizer import snowball_stem_tokenizer
from tokenizer import lancaster_stem_tokenizer
from tokenizer import wordnet_lemma_tokenizer
from tokenizer import wordnet_lemma_pos_tokenizer

In [2]:
ROOT_PATH = os.getcwd()

with open(f'{ROOT_PATH}/data/datastore/article_titles_plus_contents_all.txt', mode='r', encoding='utf-8') as file:
  raw_contents = file.read().splitlines()

with open(f'{ROOT_PATH}/data/target/article_categories_all.txt', mode='r', encoding='utf-8') as file:
  target = file.read().splitlines()

raw_df = pd.DataFrame({
    'category': target,
    'content': raw_contents
})
raw_df

Unnamed: 0,category,content
0,technology,21st-Century Sports: How Digital Technology Is...
1,business,Asian quake hits European shares Shares in Eur...
2,technology,BT offers free net phone calls BT is offering ...
3,business,Barclays shares up on merger talk Shares in UK...
4,sport,Barkley fit for match in Ireland England centr...
...,...,...
1403,sport,Woodward eyes Brennan for Lions Toulouse's for...
1404,business,WorldCom trial starts in New York The trial of...
1405,business,Yukos accused of lying to court Russian oil fi...
1406,business,Yukos drops banks from court bid Russian oil c...


In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

def tokenizing(tokenizer, X):
    vectorizer = TfidfVectorizer(tokenizer=tokenizer, min_df=.01)
    tf_idf = vectorizer.fit_transform(X)
    return tf_idf

def model_testing(tokenizers, models, X_train, Y_train):
    cv_result = {
        "model_name": [],
        "tokenizer_name": [],
        "score (default_params)": []
    }
    for tokenizer in tokenizers:
        tokenized = tokenizing(tokenizer, X_train)
        for model in models:
            cv_score = cross_val_score(model, tokenized, Y_train, cv=5, )
            cv_result["model_name"].append(type(model).__name__)
            cv_result["tokenizer_name"].append(tokenizer.__name__)
            cv_result["score (default_params)"].append(cv_score.mean())
    result_df = pd.DataFrame(cv_result).sort_values(by=['score (default_params)'], ascending=False)
    return result_df

def parameter_tuning(tokenizers, models, params, X_train, Y_train):
    tuning_result = {
        "model_name": [],
        "tokenizer_name": [],
        "best_parameter": [],
        "best_score": []
    }
    for tokenizer in tokenizers:
        tokenized = tokenizing(tokenizer, X_train)
        for model, param in zip(models, params.values()):
            clf = GridSearchCV(model, param, cv=5, n_jobs=-1, verbose=1)
            result = clf.fit(tokenized, Y_train)
            tuning_result['model_name'].append(type(model).__name__)
            tuning_result['tokenizer_name'].append(tokenizer.__name__)
            tuning_result['best_score'].append(result.best_score_)
            tuning_result['best_parameter'].append(result.best_params_)
    tuning_result = pd.DataFrame(tuning_result).sort_values(by=['best_score'], ascending=False)
    return tuning_result

def compare_result(untune_result, tuned_result):
    compare_result = pd.merge(untune_result,
                                tuned_result,
                                how='inner',
                                on=['model_name', 'tokenizer_name']
                                )[['model_name', 'tokenizer_name', 'score (default_params)', 'best_score', 'best_parameter']]
    compare_result['variance'] = compare_result['best_score'] - compare_result['score (default_params)']
    return compare_result.sort_values(by=['best_score'], ascending=False, ignore_index=True)


In [10]:
# Define tokenizer

tokenizers = [porter_stem_tokenizer, snowball_stem_tokenizer,
              lancaster_stem_tokenizer, wordnet_lemma_tokenizer, 
              wordnet_lemma_pos_tokenizer]

# Define model
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

models = [KNeighborsClassifier(), LogisticRegression()]

# Define parameter
params ={
    "knn_params":{
        'n_neighbors' : list(range(1, 16))
    },
    # "r2f_params":{
    #     'max_depth': list(range(50, 60, 2)),
    #     'min_samples_split': list(range(2, 10, 2))
    # },
    "logis_params":{
        "solver" : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        "C": list(np.arange(0.1, 1.1, 0.1))
    }
}

In [11]:
X = raw_df['content'].values
Y = raw_df['category']

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X ,Y, random_state=50, test_size=.33, stratify=Y)

In [14]:
# run pipline
test_result = model_testing(tokenizers, models, X_train, y_train)
tuned_result = parameter_tuning(tokenizers, models, params, X_train, y_train)


Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 50 candidates, totalling 250 fits


Unnamed: 0,model_name,tokenizer_name,score (default_params),best_score,best_parameter,variance
0,LogisticRegression,wordnet_lemma_pos_tokenizer,0.984088,0.984088,"{'C': 0.9, 'solver': 'newton-cg'}",0.0
1,LogisticRegression,porter_stem_tokenizer,0.983029,0.983029,"{'C': 0.8, 'solver': 'saga'}",0.0
2,LogisticRegression,snowball_stem_tokenizer,0.983029,0.983029,"{'C': 0.8, 'solver': 'newton-cg'}",0.0
3,LogisticRegression,lancaster_stem_tokenizer,0.980913,0.980913,"{'C': 0.7000000000000001, 'solver': 'saga'}",0.0
4,LogisticRegression,wordnet_lemma_tokenizer,0.980907,0.980907,"{'C': 0.6, 'solver': 'newton-cg'}",0.0
5,KNeighborsClassifier,lancaster_stem_tokenizer,0.955454,0.96925,{'n_neighbors': 7},0.013796
6,KNeighborsClassifier,snowball_stem_tokenizer,0.959704,0.968192,{'n_neighbors': 13},0.008488
7,KNeighborsClassifier,porter_stem_tokenizer,0.959693,0.968186,{'n_neighbors': 13},0.008494
8,KNeighborsClassifier,wordnet_lemma_pos_tokenizer,0.957582,0.968186,{'n_neighbors': 15},0.010605
9,KNeighborsClassifier,wordnet_lemma_tokenizer,0.956535,0.967123,{'n_neighbors': 11},0.010588


In [17]:
result = compare_result(test_result, tuned_result)
result

Unnamed: 0,model_name,tokenizer_name,score (default_params),best_score,best_parameter,variance
0,LogisticRegression,wordnet_lemma_pos_tokenizer,0.984088,0.984088,"{'C': 0.9, 'solver': 'newton-cg'}",0.0
1,LogisticRegression,porter_stem_tokenizer,0.983029,0.983029,"{'C': 0.8, 'solver': 'saga'}",0.0
2,LogisticRegression,snowball_stem_tokenizer,0.983029,0.983029,"{'C': 0.8, 'solver': 'newton-cg'}",0.0
3,LogisticRegression,lancaster_stem_tokenizer,0.980913,0.980913,"{'C': 0.7000000000000001, 'solver': 'saga'}",0.0
4,LogisticRegression,wordnet_lemma_tokenizer,0.980907,0.980907,"{'C': 0.6, 'solver': 'newton-cg'}",0.0
5,KNeighborsClassifier,lancaster_stem_tokenizer,0.955454,0.96925,{'n_neighbors': 7},0.013796
6,KNeighborsClassifier,snowball_stem_tokenizer,0.959704,0.968192,{'n_neighbors': 13},0.008488
7,KNeighborsClassifier,porter_stem_tokenizer,0.959693,0.968186,{'n_neighbors': 13},0.008494
8,KNeighborsClassifier,wordnet_lemma_pos_tokenizer,0.957582,0.968186,{'n_neighbors': 15},0.010605
9,KNeighborsClassifier,wordnet_lemma_tokenizer,0.956535,0.967123,{'n_neighbors': 11},0.010588


In [39]:
from sklearn.metrics import classification_report, accuracy_score

def apply_best_method(best_model, best_tokenizer, best_params, X, Y):
    X = tokenizing(best_tokenizer, X)
    X_train, X_test, y_train, y_test = train_test_split(X ,Y, random_state=50, test_size=.33, stratify=Y)
    model = best_model.set_params(**best_params)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    result = classification_report(y_test, pred)
    print(accuracy_score(y_test, pred))
    print(result)

In [40]:
def map_model(model_name):
    for model in models:
        if model_name == type(model).__name__:
            return model

def map_tokenizer(tokenizer_name):
    for tokenizer in tokenizers:
        if tokenizer_name == tokenizer.__name__:
            return tokenizer

best_model = map_model(result.loc[0, 'model_name'])
best_tokenizer = map_tokenizer(result.loc[0, 'tokenizer_name'])
best_params = result.loc[0, 'best_parameter']

In [41]:
apply_best_method(best_model, best_tokenizer, best_params, X, Y)

0.9741935483870968
              precision    recall  f1-score   support

    business       0.97      0.97      0.97       162
       sport       0.99      0.99      0.99       174
  technology       0.96      0.96      0.96       129

    accuracy                           0.97       465
   macro avg       0.97      0.97      0.97       465
weighted avg       0.97      0.97      0.97       465

