In [1]:
import os

import pandas as pd

from preprocessing import preprocess


DATASET_FOLDER = "data"
CACHE_FOLDER = "cache"

os.makedirs(DATASET_FOLDER, exist_ok=True)
os.makedirs(CACHE_FOLDER, exist_ok=True)

[nltk_data] Downloading package wordnet to /Users/victor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# STEP 1: Cleaning the dataset

In [2]:
print("STEP 1: Cleaning the dataset...")

# take the preprocessed dateset in the cache folder
try:
    print("\t-> Loading cached dataset")
    X_train = pd.read_csv(
        f"{CACHE_FOLDER}/X_train_preprocessed.csv", index_col="Unnamed: 0")
    Y_train = pd.read_csv(
        f"{CACHE_FOLDER}/Y_train_preprocessed.csv", index_col="Unnamed: 0")

# If the dataset is not found
except:
    print("\t-> File not found, generating preprocessed datasets")
    # Load normal datasets
    X_train = pd.read_csv(
        f"{DATASET_FOLDER}/X_train_update.csv", index_col="Unnamed: 0")
    Y_train = pd.read_csv(
        f"{DATASET_FOLDER}/Y_train_CVw08PX.csv", index_col="Unnamed: 0")

    # preprocess datasets
    X_train, Y_train = preprocess(X_train, Y_train)

    # save preprocessed datasets
    X_train.to_csv(f"{CACHE_FOLDER}/X_train_preprocessed.csv")
    Y_train.to_csv(f"{CACHE_FOLDER}/Y_train_preprocessed.csv")

print("\t-> Done")

STEP 1: Cleaning the dataset...
	-> Loading cached dataset
	-> Done


# STEP 2: Sentences embedding

In [3]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn import ensemble
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer()
tfidf = TfidfVectorizer()

X_bow = bow.fit_transform(X_train['designation'])
X_tfidf = tfidf.fit_transform(X_train['designation'])


train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    X_tfidf, Y_train['prdtypecode'])

encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

# STEP 3: Testing models

In [4]:
import xgboost

model_list = [ensemble.RandomForestClassifier(),
              linear_model.LogisticRegression(),
              naive_bayes.MultinomialNB(),
              ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0),
              ensemble.AdaBoostClassifier(n_estimators=100),
              xgboost.XGBClassifier()]

model_names = ["Random Forest",
              "Logistic Regression",
              "Multinomial Naive Bayes",
              "Gradient Boosting",
              "AdaBoost",
              "XGBoost"]

for model, model_name in zip(model_list, model_names):
    print("Testing:", model_name)
    model.fit(train_x, train_y)
    y_pred = model.predict(valid_x)
    weighted_f1 = metrics.f1_score(valid_y, y_pred, average='weighted')
    print(" -> Weighted F1 score:", weighted_f1)


Testing: Random Forest
 -> Weighted F1 score: 0.7703011418124492
Testing: Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


 -> Weighted F1 score: 0.7860657159220855
Testing: Multinomial Naive Bayes
 -> Weighted F1 score: 0.7201507293631655
Testing: Gradient Boosting
 -> Weighted F1 score: 0.49148628613589374
Testing: AdaBoost
 -> Weighted F1 score: 0.13865339519873154
Testing: XGBoost
 -> Weighted F1 score: 0.7048817250289822
