In [1]:
import numpy as np
import pandas as pd

np.random.seed(42)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import f1_score

#### Read data

In [3]:
file_name = 'data/training.txt'

with open(file_name, 'r', encoding='utf8') as inp:
    corpus = inp.read()

corpus = corpus.split('\n')

In [4]:
np.random.shuffle(corpus)

In [5]:
# split target and text

texts = []
targets = []

for line in corpus:
    if len(line) == 0:
        continue
        
    target, text = line.split('\t')
    texts.append(text)
    targets.append(int(target))


In [6]:
print('Total number of samples: {}'.format(len(texts)))
print('Positives: {}, negatives: {}'.format(sum(targets), len(targets) - sum(targets)))

Total number of samples: 7086
Positives: 3995, negatives: 3091


#### Split train and validation

In [7]:
validation_size = 3500

train_y = targets[:validation_size]
val_y = targets[validation_size:]

train_x = texts[:validation_size]
val_x = texts[validation_size:]

In [8]:
print('{} positives and {} negatives in train'.format(sum(train_y), len(train_y) - sum(train_y)))
print('{} positives and {} negatives in validation'.format(sum(val_y), len(val_y) - sum(val_y)))

1993 positives and 1507 negatives in train
2002 positives and 1584 negatives in validation


#### Bag of Words

In [9]:
vectorizer = CountVectorizer(min_df=1)

train_x_vec = vectorizer.fit_transform(train_x)
val_x_vec = vectorizer.transform(val_x)

In [10]:
cls = MultinomialNB()
cls.fit(train_x_vec, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [11]:
predictions = cls.predict(val_x_vec)

In [12]:
f1_score(val_y, predictions)

0.9826043737574551

In [13]:
cls = SVC()
cls.fit(train_x_vec, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [14]:
predictions = cls.predict(val_x_vec)

In [15]:
f1_score(val_y, predictions)

0.9394226218646474

#### TfIdf

In [16]:
vectorizer = TfidfVectorizer()

train_x_tfifd = vectorizer.fit_transform(train_x)
val_x_tfidf = vectorizer.transform(val_x)

In [17]:
cls = MultinomialNB()
cls.fit(train_x_tfifd, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
predictions = cls.predict(val_x_tfidf)

In [19]:
f1_score(val_y, predictions)

0.978913420987348

In [20]:
cls = SVC()
cls.fit(train_x_tfifd, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
predictions = cls.predict(val_x_tfidf)

In [22]:
f1_score(val_y, predictions)

0.7165354330708662