In [2]:
import sys


sys.path.append("../")

In [10]:
from classifier.constants import (
    TENNIS_ITEMS,
    FOOTBALL_ITEMS,
    BASKETBALL_ITEMS,
    WINTER_SPORT_ITEMS,
    VOLLEYBALL_ITEMS,
    HOCKEY_ITEMS,
    ATHLETICS_ITEMS,
    ESPORT_ITEMS,
    BOARDGAMES_ITEMS,
    MOTOSPORT_ITEMS,
    AUTOSPORT_ITEMS,
    EXTREME_ITEMS,
    MARTIAL_ARTS_ITEMS,
    CATEGORY_TO_ITEMS
)

In [19]:
import string
import pandas as pd

from tqdm import tqdm
from typing import List, Dict, Set
from classifier.voter import Voter
from sklearn.metrics import accuracy_score
from classifier.factory import build_evaluator
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
tqdm.pandas()

In [23]:
# CATEGORY_NAME = "tennis"
# evaluator = build_evaluator(CATEGORY_NAME, TENNIS_ITEMS)
# evaluator.compute_metrics(validation)
# evaluator.compute_false_negative(validation)

In [11]:
mapping = {
    "k": "к",
    "ё": "е",
    "e": "е",
    "a": "а",
    "o": "о",
    "p": "р"
}

def replace(text: str, mapping: Dict[str, List[str]] = mapping) -> str:
    for k, v in mapping.items():
        text = text.replace(k, v)
    return text


def text_has_oov(text: str, vocab: Set[str]):
    for char in text:
        if char not in vocab:
            return True
    return False


def preprocess(text: str) -> str:
    text = text.lower()
    text = replace(text)
    tokens = text.split()
    tokens = [token.strip(string.punctuation) for token in tokens]
    tokens = [token for token in tokens if "id" not in token]
    tokens = [token for token in tokens if not token.isdigit()]
    tokens = [token for token in tokens if text_has_oov(token, string.punctuation + "–")]
    return " ".join(tokens)

In [12]:
dataset = pd.read_csv("../data/train.csv")

In [13]:
train, validation = train_test_split(dataset, test_size=0.1, random_state=0)

In [25]:
voter = Voter(category_to_items=CATEGORY_TO_ITEMS, default_category="unknown", default_rank=1)

In [26]:
train = voter.vote(train)

100%|███████████████████████████████████| 34866/34866 [00:13<00:00, 2655.47it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_predicted"] = categories
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_predicted_rank"] = ranks


In [27]:
validation = voter.vote(validation)

100%|█████████████████████████████████████| 3874/3874 [00:01<00:00, 2675.28it/s]


In [14]:
train["text_preprocessed"] = train["text"].progress_apply(preprocess)

100%|██████████████████████████████████| 34866/34866 [00:01<00:00, 21290.91it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["text_preprocessed"] = train["text"].progress_apply(preprocess)


In [15]:
validation["text_preprocessed"] = validation["text"].progress_apply(preprocess)

100%|████████████████████████████████████| 3874/3874 [00:00<00:00, 19513.73it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation["text_preprocessed"] = validation["text"].progress_apply(preprocess)


In [32]:
# score of reg exp classifier
accuracy_score(
    y_true=validation[validation.category_predicted != "unknown"].category, 
    y_pred=validation[validation.category_predicted != "unknown"].category_predicted
)

0.8152173913043478

## Train second stage model

In [33]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train.text_preprocessed)

TfidfVectorizer()

In [34]:
model = LogisticRegression()

In [None]:
model.fit(vectorizer.transform(train.text_preprocessed).toarray(), train.category)

In [None]:
# model perfomance
accuracy_score(
    y_true=model.predict(vectorizer.transform(validation.text_preprocessed).toarray()),
    y_pred=validation.category
)

In [None]:
# Model perfomance where reg exp gives unknown
metrics.accuracy_score(
    y_true=validation[validation.category_predicted == "unknown"].category, 
    y_pred=model.predict(
        vectorizer.transform(
            validation[validation.category_predicted == "unknown"].text_preprocessed
        ),
    )
)

## Submit

In [None]:
hidden = pd.read_csv("../data/test.csv")
hidden["text_preprocessed"] = hidden["text"].apply(preprocess)
hidden["category"] = model.predict(vectorizer.transform(hidden.text_preprocessed))
hidden[["oid", "category"]].to_csv("../data/submission.csv", index=False)