# TF-IDF test-split

In [5]:
%matplotlib notebook
import warnings
import numpy as np

# Prevents warnings during cross-validation
warnings.filterwarnings("ignore")

# Number of folds during cross-validation
k = 7

# Number of parallel computations (n_jobs parameter); -1 for utilizing the entire cpu
jobs = -1

# Pseudo-random number generator seed, for reproduceable results
seed = 42

In [6]:
def so_load_data(path):
    with open(path) as f:
        content = f.read().splitlines()
    # you may also want to remove whitespace characters like \n at the end of each line
    return [x.rstrip('\\n') for x in content]

In [7]:
import pandas as pd
import codecs
from tqdm import tqdm

def load_dataset(path):
    dataset = pd.DataFrame(columns=['X', 'y1', 'y2'])
    #print('Loading dataset...')
    with codecs.open(path, "r", encoding='utf-8', errors='ignore') as fdata:
        for line in tqdm(fdata.readlines()):
            line_split = line.split()
            formated = ' '.join(line_split[:-2])
            dataset.loc[-1] = [formated, line_split[-2], line_split[-1]]  # adding a row
            dataset.index = dataset.index + 1  # shifting index
            dataset = dataset.sort_index()  # sorting by index
    return dataset

In [8]:
from sklearn.utils import shuffle

dataset = load_dataset(path = '/home/text_mining_project/text_mining_project_2018/evaluation/germeval2018.training.txt')


100%|██████████| 5009/5009 [00:17<00:00, 290.78it/s]


In [5]:
from sklearn.model_selection import train_test_split
X = dataset['X'].values
y = dataset['y1'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re

stopwords_german = set(stopwords.words('german'))

usernamePattern = re.compile('@[A-Za-z0-9_]{1,15}')
urlPattern = re.compile('(https?:\/\/)[\/.:\w(1-9)]*\s?')
andPattern = re.compile('&amp;')
lbrPattern = re.compile('|LBR|')
gtPattern = re.compile('&gt;')
ltPattern = re.compile('&lt;')
minusPattern = re.compile('-')
stemmer = SnowballStemmer("german")
tkz = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def tokenize(text):
    text = re.sub(usernamePattern, "",  text)
    text = re.sub(urlPattern, "",  text)
    text = re.sub(andPattern, "und", text)
    text = re.sub(lbrPattern, "",  text)
    text = re.sub(gtPattern, ">", text)
    text = re.sub(ltPattern, "<", text)
    text = re.sub(minusPattern, ' ', text)
    output = []
    tokens = tkz.tokenize(text)
    for token in tokens:
        #if token not in stopwords_german:
        if len(token) > 1:
            if token[0] == '#':
                token = token[1:]
            # output.append(stemmer.stem(token))
            output.append(token)
    return output

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = None
stopwords_german = set(stopwords.words('german'))
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords_german, max_features=max_features, ngram_range=(1,3))
print('Transforming documents...')
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
print('Transformation finished!')

Transforming documents...
Transformation finished!


In [8]:
from sklearn.preprocessing import LabelEncoder

def encode_label(y):
    np.ones(len(y))
    choose = lambda l : 1 if l == 'OFFENSE' else 0
    return [choose(l) for l in y]

In [9]:
y_train = encode_label(y_train)
y_test = encode_label(y_test)
print()




In [10]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

#lsvc = LinearSVC(penalty="l1", dual=False).fit(X_train, y_train)
#model = SelectFromModel(lsvc, prefit=True)
#X_train = model.transform(X_train)
#X_test = model.transform(X_test)

In [11]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import  SGDClassifier

from sklearn.model_selection import cross_val_score


names = ["Linear SVM", "SGDClassifier", "BernoulliNB", "LogisticRegression",
         "KNeighborsClassifier", "AdaBoostClassifier", "Random Forest", "Decision Tree"]

classifiers = [
    LinearSVC(random_state=seed),
    SGDClassifier(),
    BernoulliNB(),
    LogisticRegression(random_state=seed, solver='sag', max_iter=1000),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier(random_state=seed),
    DecisionTreeClassifier(random_state=seed)
]

print('Searching best estimator...')
print()
best_classifier = None
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=k, n_jobs=jobs)
    print('Mean accuracy %s: %0.3f (+/- %0.3f)' % (name, scores.mean(), scores.std() * 2))
    if not best_classifier:
        best_classifier = (name, scores.mean())
    else:
        if best_classifier[1] < scores.mean():
            best_classifier = (name, scores.mean())
print()
print('Best estimator: %s (mean acc %0.3f, %d-fold cross-validation)' % (best_classifier[0], best_classifier[1], k))

Searching best estimator...

Mean accuracy Linear SVM: 0.707 (+/- 0.022)
Mean accuracy SGDClassifier: 0.723 (+/- 0.037)
Mean accuracy BernoulliNB: 0.665 (+/- 0.008)
Mean accuracy LogisticRegression: 0.669 (+/- 0.006)
Mean accuracy KNeighborsClassifier: 0.697 (+/- 0.019)
Mean accuracy AdaBoostClassifier: 0.720 (+/- 0.024)
Mean accuracy Random Forest: 0.699 (+/- 0.009)
Mean accuracy Decision Tree: 0.695 (+/- 0.036)

Best estimator: SGDClassifier (mean acc 0.723, 7-fold cross-validation)


In [12]:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

print('Searching best estimator (F1 score) ...')
print()
best_classifier = None
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=k, n_jobs=jobs, scoring='f1')
    print('Mean F1 score %s: %0.3f (+/- %0.3f)' % (name, scores.mean(), scores.std() * 2))
    if not best_classifier:
        best_classifier = (name, scores.mean())
    else:
        if best_classifier[1] < scores.mean():
            best_classifier = (name, scores.mean())
print()
print('Best estimator: %s (mean F1 score %0.3f, %d-fold cross-validation)' % (best_classifier[0], best_classifier[1], k))

Searching best estimator (F1 score) ...

Mean F1 score Linear SVM: 0.303 (+/- 0.074)
Mean F1 score SGDClassifier: 0.411 (+/- 0.090)
Mean F1 score BernoulliNB: 0.069 (+/- 0.033)
Mean F1 score LogisticRegression: 0.048 (+/- 0.025)
Mean F1 score KNeighborsClassifier: 0.263 (+/- 0.140)
Mean F1 score AdaBoostClassifier: 0.436 (+/- 0.041)
Mean F1 score Random Forest: 0.239 (+/- 0.044)
Mean F1 score Decision Tree: 0.510 (+/- 0.052)

Best estimator: Decision Tree (mean F1 score 0.510, 7-fold cross-validation)


---

### Test
Test our classifier splitting training data in train and test set

In [9]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

def tokenize(text):
    tkz = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    stemmer = SnowballStemmer("german")
    output = []
    tokens = tkz.tokenize(text)
    for token in tokens:
        if len(token) > 1:
            if token[0] == '#':
                token = token[1:]
            output.append(stemmer.stem(token))
    return output

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


X = dataset['X'].values
y = dataset['y1'].values

lb = LabelEncoder()
lb.fit(['OFFENSE','OTHER'])
y = np.ones(len(y)) - lb.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

stopwords_german = set(stopwords.words('german'))
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords_german, ngram_range=(1,3))
print('Transforming documents...')
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
print('Transformation finished!')

Transforming documents...
Transformation finished!


In [12]:
from sklearn.linear_model import  SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
from sklearn.metrics import f1_score
f1=f1_score(predictions, y_test)
print("F1-Score predictions SGDClassifier: ", f1)

F1-Score predictions SGDClassifier:  0.6357615894039734


In [16]:
from sklearn.linear_model import  SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
from sklearn.metrics import accuracy_score
acc=accuracy_score(predictions, y_test)
print("Accuracy predictions SGDClassifier: ", acc)

Accuracy predictions SGDClassifier:  0.7784431137724551


In [13]:
from sklearn.linear_model import  SGDClassifier
clf = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='squared_hinge', max_iter=5,
       n_iter=None, n_jobs=1, penalty='l1', power_t=0.5, random_state=42,
       shuffle=False, tol=None, verbose=0, warm_start=False)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
from sklearn.metrics import f1_score
f1=f1_score(predictions, y_test)
print("F1-Score predictions SGDClassifier: ", f1)

F1-Score predictions SGDClassifier:  0.628099173553719


In [18]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel


lsvc = LinearSVC(penalty="l1", dual=False).fit(X_train, y_train)
model = SelectFromModel(lsvc, prefit=True)
X_train = model.transform(X_train)
X_test = model.transform(X_test)

In [19]:
from sklearn.linear_model import  SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)
from sklearn.metrics import f1_score
f1=f1_score(predictions, y_test)
print("F1-Score predictions SGDClassifier: ", f1)

F1-Score predictions SGDClassifier:  0.5912408759124088


In [20]:
from sklearn.linear_model import  SGDClassifier
clf = SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1,
           eta0=0.0, fit_intercept=True, l1_ratio=0.15,
           learning_rate='optimal', loss='modified_huber', max_iter=3000,
           n_iter=None, n_jobs=1, penalty='none', power_t=0.5, random_state=42,
           shuffle=True, tol=None, verbose=0, warm_start=False)
clf.fit(X_train, y_train)
predictions=clf.predict(X_test)


In [21]:
from sklearn.metrics import f1_score
f1=f1_score(predictions, y_test)
print("F1-Score predictions SGDClassifier: ", f1)

F1-Score predictions SGDClassifier:  0.6149584487534626


In [22]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(predictions, y_test)
print("Accuracy-Score predictions SGDClassifier: ", accuracy)

Accuracy-Score predictions SGDClassifier:  0.7225548902195609
