## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Load Dataset

In [2]:
from datasets import load_stratified_dataset

# Load the webhose sample, requesting 300 samples per `category` label.
# NOTE(review): exact sampling semantics live in load_stratified_dataset - confirm there.
df = load_stratified_dataset(path='datasets/webhose/dataset.csv', labels='category', samples_per_label=300)

# Classify on the title only. Item assignment is used on purpose: `df.text = ...`
# only updates a column that already exists; if there is no `text` column, pandas
# sets a plain Python attribute instead and later `df['text']` lookups raise KeyError.
df['text'] = df['title']

KeyboardInterrupt: 

### Dataframe for results
Create a dataframe that collects the results of all models.

In [None]:
# Results table with one row per preprocessing strategy; the model cells
# add one score column each.
model_results = pd.DataFrame(
    data=['Word Bagging', 'Word Embedding'],
    columns=['Preprocessing'],
)

## Prepare Dataset

In [None]:
# Quick sanity check of the loaded sample: (number of rows, number of columns).
df.shape

In [None]:
# Bag-of-words representation: TfidfVectorizer turns each text into a weighted term vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings used below:
#   sublinear_tf=True      -> logarithmic scaling of term frequencies
#   min_df=5               -> drop terms that occur in fewer than 5 documents
#   ngram_range=(1, 2)     -> use unigrams and bigrams
#   stop_words='english'   -> remove common English stop words
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english',
)

# Dense feature matrix for the whole sample (rows: documents, columns: terms)
features = tfidf.fit_transform(df.text).toarray()

labels = df['category']

print('{} samples represented by {} features'.format(*features.shape))

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw texts and their labels; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['category'],
    random_state=42,
)

# Fit the vectorizer on the training texts only, then apply the fitted
# vectorizer to the held-out texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the TF-IDF features.
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out split once and reuse the value for the table and the printout.
result = clf.score(X_test_tfidf, y_test)

# Store the unrounded score, like the other model cells; rounding happens when the
# table is displayed. Calling `result.round(3)` here would also fail when `score`
# hands back a plain Python float (depends on the scikit-learn version).
# The second row ('Word Embedding') is left empty for this model.
model_results['NB'] = [result, None]
print(f"Accuracy: {result * 100:.3f}%")

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings on the TF-IDF features.
svc = SVC().fit(X_train_tfidf, y_train)

# Score the held-out split once; the same value feeds the results table and the printout.
result = svc.score(X_test_tfidf, y_test)

# First row = 'Word Bagging'; the 'Word Embedding' row is filled in by the embedding section.
model_results['SVC'] = [result, None]
print(f"Accuracy: {result * 100:.3f}%")

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with default settings on the TF-IDF features.
# NOTE(review): recent xgboost releases may reject non-integer class labels -
# confirm the dtype of `category` (or label-encode it) for the installed version.
xgb = XGBClassifier().fit(X_train_tfidf, y_train)

# Score the held-out split once; the same value feeds the results table and the printout.
result = xgb.score(X_test_tfidf, y_test)

# First row = 'Word Bagging'; the 'Word Embedding' row is filled in by the embedding section.
model_results['XGB'] = [result, None]
print(f"Accuracy: {result * 100:.3f}%")

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the TF-IDF features; seed fixed and iteration cap raised to 10000.
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train_tfidf, y_train)

# Score the held-out split once; the same value feeds the results table and the printout.
result = svc.score(X_test_tfidf, y_test)

# First row = 'Word Bagging'; the 'Word Embedding' row is filled in by the embedding section.
model_results['SVCLinear'] = [result, None]
print(f"Accuracy: {result * 100:.3f}%")

## Word Embeddings

In [None]:
import spacy
from functions import spacy_tokenizer
from time import time

## Word Embeddings (Word Vectors)

In [None]:
# Load the large English model
nlp = spacy.load('en_core_web_lg')

# Only the document vectors are needed, so switch off every pipeline component
# while embedding. The component names must be passed explicitly: calling
# disable_pipes() without names disables nothing and the full pipeline still runs.
# NOTE(review): assumes Doc.vector is derived from the model's word vectors and
# does not need any pipeline component - confirm for the installed spaCy/model version.
with nlp.disable_pipes(*nlp.pipe_names):
    vectors = np.array([nlp(text).vector for text in df.text])

In [None]:
# Shape of the array built from the per-text document vectors.
vectors.shape

In [None]:
# Split the embedding matrix and labels, using the same seed as the bag-of-words split.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    df['category'],
    random_state=42,
)

In [None]:
# Support vector classifier with default settings on the document vectors.
svc = SVC().fit(X_train, y_train)

# Score the held-out split once; the same value feeds the results table and the printout.
result = svc.score(X_test, y_test)

# Row 1 of model_results is the 'Word Embedding' row.
model_results.loc[1, 'SVC'] = result
print(f"Accuracy: {result * 100:.3f}%")

In [None]:
# Gradient-boosted trees with default settings on the document vectors.
# NOTE(review): recent xgboost releases may reject non-integer class labels -
# confirm the dtype of `category` (or label-encode it) for the installed version.
xgb = XGBClassifier().fit(X_train, y_train)

# Score the held-out split once; the same value feeds the results table and the printout.
result = xgb.score(X_test, y_test)

# Row 1 of model_results is the 'Word Embedding' row.
model_results.loc[1, 'XGB'] = result
print(f"Accuracy: {result * 100:.3f}%")

In [None]:
# Linear SVM on the document vectors; seed fixed and iteration cap raised to 10000.
svc = LinearSVC(random_state=1, dual=False, max_iter=10000)
svc.fit(X_train, y_train)

# Score the held-out split once; the same value feeds the results table and the printout.
result = svc.score(X_test, y_test)

# Row 1 of model_results is the 'Word Embedding' row.
model_results.loc[1, 'SVCLinear'] = result
print(f"Accuracy: {result * 100:.3f}%")

In [None]:
# Display the comparison table rounded to three decimals; the rounded copy is
# not assigned back, so model_results keeps the full-precision scores.
model_results.round(3)

In [None]:
# Experiment: how does SVC accuracy change with the number of samples per label,
# for TF-IDF bag-of-words features versus spaCy document vectors?
# Fills three parallel lists: `samples`, `word_bagging`, `word_embedding`.
# NOTE: the loop rebinds the notebook-level names df, X_train, X_test, y_train,
# y_test, X_train_tfidf, X_test_tfidf, vectors and result on every iteration.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')

svc = SVC()

# Load the large English model
nlp = spacy.load('en_core_web_lg')

samples = []
word_bagging = []
word_embedding = []
for num_samples in [10, 100, 300, 600, 1000]:
    start_time = time()

    samples.append(num_samples)

    df = load_stratified_dataset(path='datasets/webhose/dataset.csv', labels='category', samples_per_label=num_samples)
    # Item assignment guarantees a real `text` column; attribute assignment
    # (`df.text = ...`) would only set a Python attribute if the column is missing.
    df['text'] = df['title']

    # --- Bag of words ---
    X_train, X_test, y_train, y_test = train_test_split(df['text'], df['category'], random_state=42)

    # NOTE(review): with min_df=5 the smallest sample sizes may leave very few
    # (or no) terms in the vocabulary - confirm the 10-sample run behaves as intended.
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    svc.fit(X_train_tfidf, y_train)
    result = svc.score(X_test_tfidf, y_test)
    word_bagging.append(result)

    # --- Word embeddings ---
    # Only the document vectors are needed, so every pipeline component is switched
    # off by name; disable_pipes() without names would disable nothing.
    with nlp.disable_pipes(*nlp.pipe_names):
        vectors = np.array([nlp(text).vector for text in df['text']])

    X_train, X_test, y_train, y_test = train_test_split(vectors, df['category'], random_state=42)

    # The same SVC instance is refitted here on the embedding features.
    svc.fit(X_train, y_train)
    result = svc.score(X_test, y_test)
    word_embedding.append(result)

    print('Calculation with {} samples finished in {:.1f}s'.format(num_samples, time()-start_time))

In [None]:
# SVC test score per sample size, one line per text representation.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(samples, word_bagging, label='word bagging')
ax.plot(samples, word_embedding, label='word embedding')
ax.set_xlabel('samples per label')
ax.set_ylabel('accuracy')
ax.set_title('SVC accuracy vs. number of samples per label')
ax.legend();