# Part 2 Count-based Embeddings + ML Classifier

## Utils

In [1]:
import pandas as pd
import numpy as np

np.random.seed(0)  # reproducibility

In [2]:
def submit_to_scv(pred, 
                  sample_csv='data/test_a_sample_submit.csv', 
                  path='submissions/sub.csv'):
    
    sub = pd.read_csv(sample_csv, index_col=False)
    sub['label'] = pred
    sub.to_csv(path, index=False)

## Load data

In [3]:
%%time

train_df = pd.read_csv('data/train_set.csv', sep='\t', index_col=False)
test_df = pd.read_csv('data/test_a.csv', sep='\t', index_col=False)

CPU times: user 7.44 s, sys: 954 ms, total: 8.39 s
Wall time: 8.5 s


In [4]:
train_text, test_text = train_df['text'], test_df['text']
corpus = pd.concat([train_text, test_text], ignore_index=True)

## Word Embeddings

A text is reduced to the **base vocabulary** (after *tokenization*, *stopword removal*, *stemming*, *lemmatization*, *indexing*) it uses, a **representation**.

* [One-hot](http://localhost:8888/notebooks/tianchi-531810/part2_TF-IDF_ml.ipynb#One-hot)
* [N-gram](http://localhost:8888/notebooks/tianchi-531810/part2_TF-IDF_ml.ipynb#N-gram)
* [Bag-of-Words (BoW)](http://localhost:8888/notebooks/tianchi-531810/part2_TF-IDF_ml.ipynb#Bag-of-Words-(BoW))
* [TF-IDF](http://localhost:8888/notebooks/tianchi-531810/part2_TF-IDF_ml.ipynb#TF-IDF)
* [Latent Semantic Analysis (LSA)](http://localhost:8888/notebooks/tianchi-531810/part2_TF-IDF_ml.ipynb#Latent-Semantic-Analysis-(LSA))
* [Word2Vec](https://en.wikipedia.org/wiki/Word2vec)
* [GloVe](https://en.wikipedia.org/wiki/GloVe_(machine_learning))
* [fastText](https://en.wikipedia.org/wiki/FastText)

In [5]:
STOP_WORDS = None
#STOP_WORDS = ['3750', '648', '900']
NGRAM_RANGE=(1, 1)  # only unigram / BoW
#NGRAM_RANGE=(1, 2)

### [One-hot](https://en.wikipedia.org/wiki/One-hot)

### [N-gram](https://en.wikipedia.org/wiki/N-gram)

We can view Bag-of-Words as a special case of the $n$-gram, with $n=1$.

### [Bag-of-Words (BoW)](https://en.wikipedia.org/wiki/Bag-of-words_model)

In [6]:
%%time

from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(
    stop_words=STOP_WORDS,
    token_pattern=r'\w{1,}',
    ngram_range=NGRAM_RANGE
)

count_vectorizer.fit(corpus)  # avoid OOV
train_count_features = count_vectorizer.transform(train_text)  # .shape: (N_train, V)
test_count_features = count_vectorizer.transform(test_text)    # .shape: (N_test, V)

CPU times: user 4min 4s, sys: 3.89 s, total: 4min 7s
Wall time: 4min 9s


### [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

Term frequency–inverse document frequency (TF-IDF), one of the most popular term-weighting schemes today, gives a term-based sparse representation.

In [7]:
%%time

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    stop_words=STOP_WORDS,
    token_pattern=r'\w{1,}',
    ngram_range=NGRAM_RANGE
)

tfidf_vectorizer.fit(corpus)  # avoid OOV
train_tfidf_features = tfidf_vectorizer.transform(train_text)  # .shape: (N_train, V)
test_tfidf_features = tfidf_vectorizer.transform(test_text)    # .shape: (N_test, V)

CPU times: user 4min 4s, sys: 4.01 s, total: 4min 8s
Wall time: 4min 9s


### [Latent Semantic Analysis](https://en.wikipedia.org/wiki/Latent_semantic_analysis) (LSA)

Term-based representation is reduced to a concept-based one through Singular Value Decomposition (SVD, a "non-square" PCA).

## Linear Model

### Logistic regression (LR)

In [8]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

lr_clf = LogisticRegression(
    n_jobs=-1,
    random_state=42
)

X, y = train_tfidf_features.toarray(), train_df['label'].to_numpy()
X_test = test_tfidf_features.toarray()

X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)


lr_clf.fit(X_train[:10000], y_train[:10000])
y_pred_lr = lr_clf.predict(X_val)
score = f1_score(y_val, y_pred_lr, average='macro')

print(score)

0.8453524000909389
CPU times: user 12.2 s, sys: 15.2 s, total: 27.3 s
Wall time: 2min 21s


### Ridge classifier

In [11]:
%%time

from sklearn.linear_model import RidgeClassifier

ridge_clf = RidgeClassifier(
    random_state=42
)

ridge_clf.fit(X_train[:10000], y_train[:10000])
y_pred_ridge = ridge_clf.predict(X_val)
score = f1_score(y_val, y_pred_ridge, average='macro')

print(score)

0.8609733200311974
CPU times: user 22.5 s, sys: 1.28 s, total: 23.8 s
Wall time: 8.04 s


### Linear SVM

In [10]:
%%time

from sklearn.linear_model import SGDClassifier

svm_clf = SGDClassifier(
    n_jobs=-1,
    random_state=42
)

svm_clf.fit(X_train[:10000], y_train[:10000])
y_pred_svm = svm_clf.predict(X_val)
score = f1_score(y_val, y_pred_svm, average='macro')

print(score)

0.8717562197605281
CPU times: user 28.4 s, sys: 84 ms, total: 28.5 s
Wall time: 4.14 s


## Gradient Boosting

### XGBoost

In [None]:
import xgboost as xgb



### LightGBM

In [None]:
import lightgbm as lgb

