In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys


sys.path.append("../")

In [3]:
from classifier.constants import (
    TENNIS_ITEMS,
    FOOTBALL_ITEMS,
    BASKETBALL_ITEMS,
    WINTER_SPORT_ITEMS,
    VOLLEYBALL_ITEMS,
    HOCKEY_ITEMS,
    ATHLETICS_ITEMS,
    ESPORT_ITEMS,
    BOARDGAMES_ITEMS,
    MOTOSPORT_ITEMS,
    AUTOSPORT_ITEMS,
    EXTREME_ITEMS,
    MARTIAL_ARTS_ITEMS,
    CATEGORY_TO_ITEMS
)

In [4]:
import string
import pandas as pd
import numpy as np

from tqdm import tqdm
from typing import List, Dict, Set
from classifier.voter import Voter
from classifier.preprocessor import Preprocessor
from sklearn.metrics import accuracy_score
from classifier.factory import build_evaluator
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to /home/zel1k7/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
tqdm.pandas()

In [6]:
dataset = pd.read_csv("../data/train.csv")

In [7]:
train, validation = train_test_split(dataset, test_size=0.1, random_state=0)

In [8]:
voter = Voter(category_to_items=CATEGORY_TO_ITEMS, default_category="unknown", default_rank=1)

In [9]:
train = voter.vote(train)

100%|███████████████████████████████████| 34866/34866 [00:08<00:00, 3974.54it/s]


In [10]:
validation = voter.vote(validation)

100%|█████████████████████████████████████| 3874/3874 [00:01<00:00, 3276.65it/s]


In [11]:
preprocessor = Preprocessor()

In [12]:
train["text_preprocessed"] = preprocessor.preprocess(train.text)

In [13]:
validation["text_preprocessed"] = preprocessor.preprocess(validation.text)

In [14]:
# score of reg exp classifier
accuracy_score(
    y_true=validation[validation.category_predicted != "unknown"].category, 
    y_pred=validation[validation.category_predicted != "unknown"].category_predicted
)

0.8152173913043478

## Train second stage model

In [15]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train.text_preprocessed)

TfidfVectorizer()

In [16]:
model = LogisticRegression()

In [17]:
model.fit(vectorizer.transform(train.text_preprocessed), train.category)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [18]:
# model perfomance
accuracy_score(
    y_true=model.predict(vectorizer.transform(validation.text_preprocessed).toarray()),
    y_pred=validation.category
)

0.8440887971089314

In [19]:
# Model perfomance where reg exp gives unknown
accuracy_score(
    y_true=validation[validation.category_predicted == "unknown"].category, 
    y_pred=model.predict(
        vectorizer.transform(
            validation[validation.category_predicted == "unknown"].text_preprocessed
        ),
    )
)


0.7877030162412993

In [20]:
THRESHOLD = 0.65

## Submit

In [21]:
hidden = pd.read_csv("../data/test.csv")
hidden["text_preprocessed"] = preprocessor.preprocess(hidden.text)
hidden["predictions"] = hidden.text_preprocessed.progress_apply(
    lambda text: model.predict_proba(vectorizer.transform([text]))
)
hidden["predictions"] = hidden["predictions"].apply(lambda x: x[0])
hidden["category"] = hidden.predictions.apply(
    lambda predictions: model.classes_[np.argmax(predictions)] if max(predictions) >= THRESHOLD else None
)
hidden[~hidden.category.isna()][["oid", "category"]].to_csv("../data/submission.csv", index=False)

100%|████████████████████████████████████| 26260/26260 [01:29<00:00, 293.43it/s]
