In [1]:
%matplotlib notebook
import warnings
import numpy as np

# Silences every warning category for the rest of the session (not only the
# ones raised during cross-validation), so convergence/deprecation warnings
# are hidden as well.
warnings.filterwarnings("ignore")

# Number of folds passed as `cv` to the cross_val_score calls further down
k = 7

# Value passed as `n_jobs` to cross_val_score; -1 means use all available CPU cores
jobs = -1

# Pseudo-random number generator seed, for reproducible results
seed = 42

In [2]:
import pandas as pd
import codecs
from tqdm import tqdm

def load_dataset(path):
    """Load a whitespace-delimited corpus whose last two tokens per line are labels.

    Every non-blank line is split on whitespace. The last token becomes ``y2``,
    the second-to-last becomes ``y1`` and all remaining tokens are re-joined
    with single spaces to form the text column ``X``.

    The previous implementation joined ``line_split[:-3]``, which silently
    dropped the last word of every text; the text now ends right before the
    two label tokens.

    Rows are returned in reverse file order (last line first) with a fresh
    0..n-1 index, matching the ordering the earlier prepend-and-sort
    implementation produced.

    Parameters
    ----------
    path : str
        Path of the UTF-8 text file to read. Undecodable bytes are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``['X', 'y1', 'y2']``, all values strings. Empty (but with the
        three columns) when the file holds no usable line.

    Raises
    ------
    IndexError
        If a non-blank line consists of a single token, i.e. has no room for
        two labels.
    """
    records = []
    # Built-in open() replaces the deprecated codecs.open(); it splits lines
    # only on real newlines, not on Unicode separators inside a text.
    with open(path, "r", encoding='utf-8', errors='ignore') as fdata:
        for line in tqdm(fdata.readlines()):
            line_split = line.split()
            if not line_split:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            formated = ' '.join(line_split[:-2])
            records.append((formated, line_split[-2], line_split[-1]))
    # Build the frame once instead of growing and re-sorting it per row
    # (which was quadratic); reverse to keep the historical row order.
    records.reverse()
    return pd.DataFrame(records, columns=['X', 'y1', 'y2'])

In [3]:
import pandas as pd

def load_dataset_ext(path):
    """Load a ';'-separated corpus with two label columns and a rating column.

    Every non-blank line is stripped and split on ``';'``. The last three
    fields become ``y1``, ``y2`` and ``rating``; all preceding fields are
    joined with a single space to form the text column ``X`` (so a ``';'``
    inside the text ends up as a space).

    The first line of the file is discarded (presumably the CSV header --
    confirm against the data file). The remaining rows are returned in reverse
    file order with a fresh 0..n-1 index, matching the ordering the earlier
    prepend-and-sort implementation produced.

    Parameters
    ----------
    path : str
        Path of the UTF-8 text file to read. Undecodable bytes are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``['X', 'y1', 'y2', 'rating']``, all values strings. Empty
        (but with the four columns) when the file has at most one usable line.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three ``';'``-separated fields.
    """
    records = []
    # Built-in open() replaces the deprecated codecs.open(); it splits lines
    # only on real newlines, not on Unicode separators inside a text.
    with open(path, "r", encoding='utf-8', errors='ignore') as fdata:
        for line in tqdm(fdata.readlines()):
            line = line.strip()
            if not line:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            line_split = line.split(';')
            formated = ' '.join(line_split[:-3])
            records.append((formated, line_split[-3], line_split[-2], line_split[-1]))
    # Skip the first line read, then reverse. Building the frame once replaces
    # the per-row insert + sort_index, which was quadratic in the line count.
    rows = records[1:]
    rows.reverse()
    return pd.DataFrame(rows, columns=['X', 'y1', 'y2', 'rating'])

In [4]:
# dataset = load_dataset(path = '/home/text_mining_project/text_mining_project_2018/evaluation/germeval2018.training.txt')

In [5]:
# Read the ';'-separated corpus into a DataFrame with columns X, y1, y2, rating.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
dataset_ext = load_dataset_ext('/home/text_mining_project/text_mining_project_2018/evaluation/german_hatespeech_refugees.csv')

100%|██████████| 470/470 [00:01<00:00, 310.43it/s]


In [6]:
from sklearn.model_selection import train_test_split
# Text column as a NumPy array; split into train/test below and vectorised by the TF-IDF cell
X = dataset_ext['X'].values

def encode_label_bin(y, predicted_label):
    """Return a list of booleans, one per entry of ``y``.

    An entry is ``True`` when the corresponding label compares equal to
    ``predicted_label`` and ``False`` otherwise.
    """
    return [bool(label == predicted_label) for label in y]

# Boolean masks: True where the respective label column equals the literal 'YES'
y1 = np.array(encode_label_bin(dataset_ext['y1'], 'YES'))
y2 = np.array(encode_label_bin(dataset_ext['y2'], 'YES'))

# Positive class (1) when either label column says 'YES', otherwise 0
y = (y1 | y2)*1

# Hold out 10% of the rows as the test set.
# NOTE(review): random_state is hard-coded to 23 instead of using `seed`, and the
# split is not stratified -- confirm both are intentional.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=23)

In [7]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re

# German stop words from the NLTK stopwords corpus, as a set.
# The same assignment is repeated in the TF-IDF cell further down.
stopwords_german = set(stopwords.words('german'))

# Regular expressions applied by tokenize() to clean a text before tokenisation.
# Raw strings keep backslashes literal, so '\/' is no longer an invalid
# string escape sequence.
usernamePattern = re.compile(r'@[A-Za-z0-9_]{1,15}')
urlPattern = re.compile(r'(https?:\/\/)[\/.:\w(1-9)]*\s?')
andPattern = re.compile(r'&amp;')
# The pipes must be escaped: unescaped, '|LBR|' is an alternation whose first
# branch is empty, so it matched the empty string at every position and never
# removed the literal "|LBR|" marker.
lbrPattern = re.compile(r'\|LBR\|')
gtPattern = re.compile(r'&gt;')
ltPattern = re.compile(r'&lt;')
minusPattern = re.compile(r'-')
# Snowball stemmer for German; its only use in tokenize() is currently commented out
stemmer = SnowballStemmer("german")
# NLTK tweet tokenizer configured not to preserve case, to shorten character
# repetitions (reduce_len) and to strip @handles
tkz = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def tokenize(text):
    """Clean ``text`` with the module-level patterns and split it into tokens.

    Substitutions are applied in a fixed order: user names and URLs are
    removed, ``&amp;`` becomes ``und``, the line-break pattern is removed,
    ``&gt;`` / ``&lt;`` are turned back into ``>`` / ``<`` and hyphens become
    spaces. The result is tokenised with the module-level ``tkz``.

    Tokens of length one or less are discarded, and a leading ``#`` is removed
    from the tokens that are kept. Neither stemming nor stop-word filtering is
    applied here.

    Returns the list of remaining tokens.
    """
    substitutions = (
        (usernamePattern, ""),
        (urlPattern, ""),
        (andPattern, "und"),
        (lbrPattern, ""),
        (gtPattern, ">"),
        (ltPattern, "<"),
        (minusPattern, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    kept = []
    for tok in tkz.tokenize(text):
        if len(tok) <= 1:
            continue
        # Hashtags contribute their bare word.
        kept.append(tok[1:] if tok[0] == '#' else tok)
    return kept

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# No cap on the vocabulary size
max_features = None
stopwords_german = set(stopwords.words('german'))
# Recent scikit-learn releases validate `stop_words` and accept only 'english',
# a list or None, so the set is converted to a sorted (deterministic) list.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=sorted(stopwords_german), max_features=max_features, ngram_range=(1,3))
print('Transforming documents...')
# NOTE: X_train / X_test are overwritten with their sparse TF-IDF matrices
# (uni- to tri-grams), so this cell cannot be re-run without re-running the
# train/test split first. The vocabulary is learned from the training split only.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
print('Transformation finished!')

Transforming documents...
Transformation finished!


In [9]:
from sklearn.preprocessing import LabelEncoder

def encode_label(y):
    """Map labels to integers: 1 for the exact string ``'OFFENSE'``, 0 for anything else.

    Parameters
    ----------
    y : iterable
        Labels to encode. Any iterable is accepted, including generators; the
        previous version evaluated ``np.ones(len(y))`` and threw the result
        away, which did wasted work and rejected iterables without ``len``.

    Returns
    -------
    list of int
        One 0/1 entry per label, in input order.
    """
    return [1 if label == 'OFFENSE' else 0 for label in y]

In [10]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import  SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score


# Candidate estimators, compared by mean F1 over k-fold cross-validation on the
# training split. Display names and estimators are paired by position.
names = ["Linear SVM", "SGDClassifier", "BernoulliNB", "LogisticRegression",
         "KNeighborsClassifier", "AdaBoostClassifier", "Random Forest", "Decision Tree"]

# Every estimator that takes a random_state is seeded with `seed`; SGDClassifier
# and AdaBoostClassifier were previously unseeded, so their scores varied
# from run to run.
classifiers = [
    LinearSVC(random_state=seed),
    SGDClassifier(random_state=seed),
    BernoulliNB(),
    LogisticRegression(random_state=seed, solver='sag', max_iter=1000),
    KNeighborsClassifier(),
    AdaBoostClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    DecisionTreeClassifier(random_state=seed)
]

print('Searching best estimator (F1 score) ...')
print()
# (name, mean F1) of the best estimator seen so far; ties keep the earlier one.
best_classifier = None
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=k, n_jobs=jobs, scoring='f1')
    mean_f1 = scores.mean()
    # The printed spread is two standard deviations of the per-fold scores.
    print('Mean F1 score %s: %0.3f (+/- %0.3f)' % (name, mean_f1, scores.std() * 2))
    if best_classifier is None or best_classifier[1] < mean_f1:
        best_classifier = (name, mean_f1)
print()
print('Best estimator: %s (mean F1 score %0.3f, %d-fold cross-validation)' % (best_classifier[0], best_classifier[1], k))

Searching best estimator (F1 score) ...

Mean F1 score Linear SVM: 0.139 (+/- 0.247)
Mean F1 score SGDClassifier: 0.213 (+/- 0.266)
Mean F1 score BernoulliNB: 0.000 (+/- 0.000)
Mean F1 score LogisticRegression: 0.000 (+/- 0.000)
Mean F1 score KNeighborsClassifier: 0.353 (+/- 0.177)
Mean F1 score AdaBoostClassifier: 0.542 (+/- 0.153)
Mean F1 score Random Forest: 0.088 (+/- 0.180)
Mean F1 score Decision Tree: 0.550 (+/- 0.099)

Best estimator: Decision Tree (mean F1 score 0.550, 7-fold cross-validation)


In [15]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
# random_state=seed makes the stochastic optimiser, and thus the score, repeatable.
clf = SGDClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.5185185185185185


In [14]:
# Import the estimator this cell actually uses (it previously imported SGDClassifier).
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
# random_state=seed matches the cross-validation setup and makes the tree repeatable.
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions SGDClassifier:  0.30769230769230765


In [22]:
# Import the estimator this cell actually uses (it previously imported SGDClassifier).
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
# random_state=seed makes the boosted ensemble, and thus the score, repeatable.
clf = AdaBoostClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.32


## Feature Selection

In [16]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Fit an L1-penalised linear SVM on the training split and let SelectFromModel
# pick features based on the already-fitted estimator (prefit=True).
lsvc = LinearSVC(penalty="l1", dual=False).fit(X_train, y_train)
model = SelectFromModel(lsvc, prefit=True)
# NOTE(review): X_train / X_test are overwritten with the reduced matrices, so
# re-running this cell selects again on already-reduced data.
# NOTE(review): selection is fitted on the whole training split before the
# cross-validation below, so those CV scores may be optimistic.
X_train = model.transform(X_train)
X_test = model.transform(X_test)

In [17]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import  SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score


# Same comparison as before feature selection: mean F1 over k-fold
# cross-validation on the (now reduced) training matrix. Display names and
# estimators are paired by position.
names = ["Linear SVM", "SGDClassifier", "BernoulliNB", "LogisticRegression",
         "KNeighborsClassifier", "AdaBoostClassifier", "Random Forest", "Decision Tree"]

# Every estimator that takes a random_state is seeded with `seed`; SGDClassifier
# and AdaBoostClassifier were previously unseeded, so their scores varied
# from run to run.
classifiers = [
    LinearSVC(random_state=seed),
    SGDClassifier(random_state=seed),
    BernoulliNB(),
    LogisticRegression(random_state=seed, solver='sag', max_iter=1000),
    KNeighborsClassifier(),
    AdaBoostClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    DecisionTreeClassifier(random_state=seed)
]

print('Searching best estimator (F1 score) ...')
print()
# (name, mean F1) of the best estimator seen so far; ties keep the earlier one.
best_classifier = None
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=k, n_jobs=jobs, scoring='f1')
    mean_f1 = scores.mean()
    # The printed spread is two standard deviations of the per-fold scores.
    print('Mean F1 score %s: %0.3f (+/- %0.3f)' % (name, mean_f1, scores.std() * 2))
    if best_classifier is None or best_classifier[1] < mean_f1:
        best_classifier = (name, mean_f1)
print()
print('Best estimator: %s (mean F1 score %0.3f, %d-fold cross-validation)' % (best_classifier[0], best_classifier[1], k))

Searching best estimator (F1 score) ...

Mean F1 score Linear SVM: 0.428 (+/- 0.216)
Mean F1 score SGDClassifier: 0.647 (+/- 0.162)
Mean F1 score BernoulliNB: 0.680 (+/- 0.140)
Mean F1 score LogisticRegression: 0.000 (+/- 0.000)
Mean F1 score KNeighborsClassifier: 0.549 (+/- 0.202)
Mean F1 score AdaBoostClassifier: 0.614 (+/- 0.110)
Mean F1 score Random Forest: 0.562 (+/- 0.131)
Mean F1 score Decision Tree: 0.601 (+/- 0.105)

Best estimator: BernoulliNB (mean F1 score 0.680, 7-fold cross-validation)


In [18]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
# random_state=seed makes the stochastic optimiser, and thus the score, repeatable.
clf = SGDClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.4


In [19]:
# Import the estimator this cell actually uses (it previously imported SGDClassifier).
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
# random_state=seed matches the cross-validation setup and makes the tree repeatable.
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.3448275862068966


In [20]:
# Import the estimator this cell actually uses (it previously imported SGDClassifier).
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
clf = BernoulliNB()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.3333333333333333


In [21]:
# Import the estimator this cell actually uses (it previously imported SGDClassifier).
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score

# Fit on the whole training split and score once on the held-out test split.
# random_state=seed makes the boosted ensemble, and thus the score, repeatable.
clf = AdaBoostClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.32
