In [1]:
import os

import pandas as pd

from preprocessing.preprocessing import preprocess


DATASET_FOLDER = "data"
CACHE_FOLDER = "cache"

os.makedirs(DATASET_FOLDER, exist_ok=True)
os.makedirs(CACHE_FOLDER, exist_ok=True)

[nltk_data] Downloading package wordnet to /Users/victor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# STEP 1: Cleaning the dataset

In [2]:
print("STEP 1: Cleaning the dataset...")

# take the preprocessed dateset in the cache folder
try:
    print("\t-> Loading cached dataset")
    X_train = pd.read_csv(
        f"{CACHE_FOLDER}/X_train_preprocessed.csv", index_col="Unnamed: 0")
    Y_train = pd.read_csv(
        f"{CACHE_FOLDER}/Y_train_preprocessed.csv", index_col="Unnamed: 0")

# If the dataset is not found
except:
    print("\t-> File not found, generating preprocessed datasets")
    # Load normal datasets
    X_train = pd.read_csv(
        f"{DATASET_FOLDER}/X_train_update.csv", index_col="Unnamed: 0")
    Y_train = pd.read_csv(
        f"{DATASET_FOLDER}/Y_train_CVw08PX.csv", index_col="Unnamed: 0")

    # preprocess datasets
    X_train, Y_train = preprocess(X_train, Y_train)

    # save preprocessed datasets
    X_train.to_csv(f"{CACHE_FOLDER}/X_train_preprocessed.csv")
    Y_train.to_csv(f"{CACHE_FOLDER}/Y_train_preprocessed.csv")

print("\t-> Done")

STEP 1: Cleaning the dataset...
	-> Loading cached dataset
	-> Done


# STEP 2: Sentences embedding

In [3]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn import ensemble, tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()
tfidf = TfidfVectorizer()

X_bow = bow.fit_transform(X_train['designation'])
X_tfidf = tfidf.fit_transform(X_train['designation'])


train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    X_tfidf, Y_train['prdtypecode'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

# STEP 3: Finding best hyper-parameters

### Finetuning MultinomialNB

In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

if False:
    clf=naive_bayes.MultinomialNB()


    param_grid = {"alpha" : [0, 0.05, .1, .2, 1.0]}

    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(clf, param_grid, scoring="f1_weighted",
                               cv=skf, return_train_score=True)

    grid_search.fit(train_x, train_y)

    # make the predictions
    y_pred = grid_search.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)

    print('Best params:')
    print(grid_search.best_params_)

### Finetuning LogisticRegression

In [5]:
if False:
    clf = linear_model.LogisticRegression()

    param_grid = {
        "penalty": ["l2"],
        "C": [1, 1.5, 2, 4, 8]
    }

    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(clf, param_grid, scoring="f1_weighted",
                               cv=skf, return_train_score=True)

    grid_search.fit(train_x, train_y)

    # make the predictions
    y_pred = grid_search.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)

    print('Best params:')
    print(grid_search.best_params_)

### Finetuning AdaBoost Classifier

In [6]:
if False:
    
    clf=ensemble.AdaBoostClassifier()

    param_grid = {
        "base_estimator" : [tree.DecisionTreeClassifier(max_depth=2)],
        "n_estimators": [1200, 2000, 3000, 5000],
        "learning_rate": [.1]
    }


    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(clf, param_grid, scoring="f1_weighted",
                               cv=skf, return_train_score=True)

    grid_search.fit(train_x, train_y)

    # make the predictions
    y_pred = grid_search.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)

    print('Best params:')
    print(grid_search.best_params_)


### Finetuning RandomForest Classifier

In [7]:
if False:

    clf = ensemble.RandomForestClassifier()

    param_grid = {
        'min_samples_split': [2],
        'n_estimators': [500, 1000, 2000]
    }

    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(clf, param_grid, scoring="f1_weighted",
                               cv=skf, return_train_score=True)

    grid_search.fit(train_x, train_y)

    # make the predictions
    y_pred = grid_search.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)

    print('Best params:')
    print(grid_search.best_params_)

### Finetuning GradientBoostingClassifier

In [8]:
if False:
    
    clf=ensemble.GradientBoostingClassifier()

    param_grid = {
        "n_estimators": [50, 100, 200, 400],
        "max_depth": [3, 4],
        "learning_rate": [.1, .05]
    }


    skf = StratifiedKFold(n_splits=5)
    grid_search = GridSearchCV(clf, param_grid, scoring="f1_weighted",
                               cv=skf, return_train_score=True)

    grid_search.fit(train_x, train_y)

    # make the predictions
    y_pred = grid_search.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)

    print('Best params:')
    print(grid_search.best_params_)

# STEP 4: Printing scores

In [9]:
import xgboost

model_list = [ensemble.RandomForestClassifier(min_samples_split=3,
                                              n_estimators=200,
                                              max_depth=None,
                                              max_features=5),
              linear_model.LogisticRegression(C=2, penalty="l2"),
              naive_bayes.MultinomialNB(alpha=0.1),
              ensemble.GradientBoostingClassifier(
                  n_estimators=150, learning_rate=0.2, max_depth=3),
              ensemble.AdaBoostClassifier(n_estimators=200), # You should put 2000 for .43 in F1
              xgboost.XGBClassifier()]

model_names = ["Random Forest",
               "Logistic Regression",
               "Multinomial Naive Bayes",
               "Gradient Boosting",
               "AdaBoost",
               "XGBoost"]

for model, model_name in zip(model_list, model_names):
    print("Testing:", model_name)
    model.fit(train_x, train_y)
    y_pred = model.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)

Testing: Random Forest
 -> Weighted F1 score: 0.7947380460721855
Testing: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 -> Weighted F1 score: 0.7951697334731598
Testing: Multinomial Naive Bayes
 -> Weighted F1 score: 0.7483833056290212
Testing: Gradient Boosting
 -> Weighted F1 score: 0.7374350643018278
Testing: AdaBoost
 -> Weighted F1 score: 0.09482472011859888
Testing: XGBoost


KeyboardInterrupt: 