In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tokenizer import porter_stem_tokenizer
from tokenizer import snowball_stem_tokenizer
from tokenizer import lancaster_stem_tokenizer
from tokenizer import wordnet_lemma_tokenizer
from tokenizer import wordnet_lemma_pos_tokenizer

In [2]:
# Data paths are built from the kernel's current working directory, so that
# directory must contain the `data/` folder referenced below.
ROOT_PATH = os.getcwd()

# Article texts: each line of the file becomes one entry of `raw_contents`.
with open(f'{ROOT_PATH}/data/datastore/article_titles_plus_contents_all.txt', mode='r', encoding='utf-8') as file:
  raw_contents = file.read().splitlines()

# Article labels: each line of the file becomes one entry of `target`.
with open(f'{ROOT_PATH}/data/target/article_categories_all.txt', mode='r', encoding='utf-8') as file:
  target = file.read().splitlines()

# The two lists are paired purely by position: entry i of `target` is used as
# the category of entry i of `raw_contents`.
raw_df = pd.DataFrame({
    'category': target,
    'content': raw_contents
})
raw_df

Unnamed: 0,category,content
0,technology,21st-Century Sports: How Digital Technology Is...
1,business,Asian quake hits European shares Shares in Eur...
2,technology,BT offers free net phone calls BT is offering ...
3,business,Barclays shares up on merger talk Shares in UK...
4,sport,Barkley fit for match in Ireland England centr...
...,...,...
1403,sport,Woodward eyes Brennan for Lions Toulouse's for...
1404,business,WorldCom trial starts in New York The trial of...
1405,business,Yukos accused of lying to court Russian oil fi...
1406,business,Yukos drops banks from court bid Russian oil c...


In [71]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

def tokenizing(tokenizer, X):
    """Vectorize raw documents into TF-IDF features with a custom tokenizer.

    Parameters
    ----------
    tokenizer : callable
        Forwarded to ``TfidfVectorizer(tokenizer=...)``.
    X : iterable
        Raw documents. The vectorizer is fitted on *all* of them and the
        same documents are transformed.

    Returns
    -------
    Whatever ``TfidfVectorizer.fit_transform(X)`` returns.
    """
    # min_df=.01 is passed to the vectorizer as-is (a float min_df is a
    # proportion of documents according to the scikit-learn documentation).
    return TfidfVectorizer(tokenizer=tokenizer, min_df=.01).fit_transform(X)

def model_testing(tokenizers, models, X, Y):
    """Cross-validate every (tokenizer, model) pair with the models as given.

    Parameters
    ----------
    tokenizers : iterable of callables
        Each is passed to ``tokenizing``; its ``__name__`` labels the row.
    models : iterable of estimators
        Evaluated unchanged; the class name labels the row.
    X, Y
        Raw documents and their labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_name``, ``tokenizer_name`` and
        ``score (default_params)`` (mean of the 5 fold scores), sorted by
        score in descending order.

    NOTE(review): ``tokenizing`` fits the vectorizer on all of ``X`` before
    ``cross_val_score`` splits the data, so the TF-IDF statistics also see
    the held-out folds.
    """
    model_names = []
    tokenizer_names = []
    mean_scores = []

    for tok in tokenizers:
        # Vectorize once per tokenizer and reuse the matrix for every model.
        features = tokenizing(tok, X)
        for estimator in models:
            fold_scores = cross_val_score(estimator, features, Y, cv=5)
            model_names.append(type(estimator).__name__)
            tokenizer_names.append(tok.__name__)
            mean_scores.append(fold_scores.mean())

    scores_df = pd.DataFrame({
        "model_name": model_names,
        "tokenizer_name": tokenizer_names,
        "score (default_params)": mean_scores,
    })
    return scores_df.sort_values(by="score (default_params)", ascending=False)

def parameter_tuning(tokenizers, models, params, X, Y):
    """Grid-search every model on the TF-IDF features of every tokenizer.

    Parameters
    ----------
    tokenizers : iterable of callables
        Each is passed to ``tokenizing``; its ``__name__`` labels the row.
    models : iterable of estimators
        Estimators to tune; the class name labels the row.
    params : dict
        Parameter grids. Only the values are used and they are matched to
        ``models`` by position (dict insertion order); the keys are ignored.
    X, Y
        Raw documents and their labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``model_name``, ``tokenizer_name``, ``best_parameter`` and
        ``best_score``, sorted by ``best_score`` in descending order.

    Raises
    ------
    ValueError
        If the number of models differs from the number of parameter grids.
        Without this check ``zip`` would silently drop the surplus entries.
    """
    # Materialize both so they can be counted and reused for every tokenizer
    # (a one-shot iterable would be exhausted after the first tokenizer).
    models = list(models)
    param_grids = list(params.values())
    if len(models) != len(param_grids):
        raise ValueError(
            f"models and params must have the same length "
            f"(got {len(models)} models and {len(param_grids)} parameter grids)"
        )

    records = {
        "model_name": [],
        "tokenizer_name": [],
        "best_parameter": [],
        "best_score": []
    }
    for tokenizer in tokenizers:
        # Vectorize once per tokenizer and reuse the matrix for every model.
        tokenized = tokenizing(tokenizer, X)
        for model, param_grid in zip(models, param_grids):
            search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
            fitted = search.fit(tokenized, Y)
            records['model_name'].append(type(model).__name__)
            records['tokenizer_name'].append(tokenizer.__name__)
            records['best_score'].append(fitted.best_score_)
            records['best_parameter'].append(fitted.best_params_)

    return pd.DataFrame(records).sort_values(by=['best_score'], ascending=False)

def compare_result(untune_result, tuned_result):
    """Join baseline and tuned scores and report the gain from tuning.

    Parameters
    ----------
    untune_result : pandas.DataFrame
        Needs ``model_name``, ``tokenizer_name`` and
        ``score (default_params)`` columns.
    tuned_result : pandas.DataFrame
        Needs ``model_name``, ``tokenizer_name``, ``best_score`` and
        ``best_parameter`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (model, tokenizer) pair present in both inputs, with an
        extra ``variance`` column equal to ``best_score`` minus
        ``score (default_params)`` (a plain difference, not a statistical
        variance).
    """
    join_keys = ['model_name', 'tokenizer_name']
    report_columns = join_keys + ['score (default_params)', 'best_score', 'best_parameter']

    joined = pd.merge(untune_result, tuned_result, how='inner', on=join_keys)
    return joined[report_columns].assign(
        variance=lambda frame: frame['best_score'] - frame['score (default_params)']
    )


In [70]:
# Define tokenizer

# Every tokenizer imported from `tokenizer` is listed in the commented-out
# block; only the two stemmers in the active list below are evaluated.
# tokenizers = [porter_stem_tokenizer, snowball_stem_tokenizer,
#               lancaster_stem_tokenizer, wordnet_lemma_tokenizer, 
#               wordnet_lemma_pos_tokenizer]

tokenizers = [porter_stem_tokenizer, snowball_stem_tokenizer,]

# Define model
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Random forests are stochastic; fixing the seed makes the cross-validation
# and grid-search scores repeatable from one run to the next.
RANDOM_STATE = 42
models = [KNeighborsClassifier(), RandomForestClassifier(random_state=RANDOM_STATE)]

# Define parameter
# The grids are matched to `models` by position (parameter_tuning zips
# `models` with `params.values()`), so they must stay in the same order as
# `models`; the dictionary keys are labels only.
params = {
    "knn_params": {
        'n_neighbors': list(range(1, 16))
    },
    "r2f_params": {
        'max_depth': list(range(50, 60, 2)),
        'min_samples_split': list(range(2, 10, 2))
    }
}

In [56]:
# Features: the raw article text taken from the `content` column via
# `.values`; it is vectorized later inside `tokenizing`.
X = raw_df['content'].values
# Labels: the `category` column, kept as a pandas Series.
Y = raw_df['category']

In [72]:
# Run pipeline:
#   1. cross-validate every tokenizer/model pair with the models as configured,
#   2. grid-search the same pairs over `params`,
#   3. merge both tables and show the score difference per pair.
test_result = model_testing(tokenizers, models, X, Y)
tuned_result = parameter_tuning(tokenizers, models, params, X, Y)
compare_result(test_result, tuned_result)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits


Unnamed: 0,model_name,tokenizer_name,score (default_params),best_score,best_parameter,variance
0,RandomForestClassifier,porter_stem_tokenizer,0.970869,0.975132,"{'max_depth': 50, 'min_samples_split': 8}",0.004263
1,RandomForestClassifier,snowball_stem_tokenizer,0.96944,0.975132,"{'max_depth': 50, 'min_samples_split': 6}",0.005691
2,KNeighborsClassifier,porter_stem_tokenizer,0.966616,0.96733,{'n_neighbors': 7},0.000714
3,KNeighborsClassifier,snowball_stem_tokenizer,0.965907,0.968751,{'n_neighbors': 7},0.002844
