In [6]:
%matplotlib notebook
import warnings
import numpy as np

# Silences every Python warning for the rest of the session, not only the ones
# raised during cross-validation.
# NOTE(review): this also hides e.g. convergence or deprecation warnings - confirm that is intended.
warnings.filterwarnings("ignore")

# Number of folds used for cross-validation (passed as `cv`)
k = 7

# Number of parallel computations (passed as `n_jobs`); -1 for utilizing the entire CPU
jobs = -1

# Pseudo-random number generator seed, for reproducible results
seed = 42

In [7]:
import pandas as pd

def load_dataset_ext(path):
    """Load the labelled tweet CSV and reduce it to a two-column frame.

    The file is read with its first column as the index. The annotator vote
    columns (``count``, ``hate_speech``, ``offensive_language``, ``neither``)
    are dropped and the two remaining columns are renamed, in order, to
    ``'y'`` (label) and ``'X'`` (text). In the label column the values 0 and 1
    become ``'OFFENSE'`` and 2 becomes ``'OTHER'``; any other value is passed
    through unchanged.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``['y', 'X']``.

    Raises
    ------
    KeyError
        If one of the vote columns is missing from the file.
    ValueError
        If dropping the vote columns does not leave exactly two columns.
    """
    df = pd.read_csv(path, index_col=0)
    df_droped = df.drop(columns=['count', 'hate_speech', 'offensive_language', 'neither'])
    df_droped.columns = ['y', 'X']
    # Recode only the label column so that a text column that happens to hold
    # the values 0/1/2 is never rewritten.
    df_droped['y'] = df_droped['y'].replace({0: 'OFFENSE', 1: 'OFFENSE', 2: 'OTHER'})
    return df_droped

In [8]:
# Load the labelled tweets into a two-column ('y', 'X') frame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
dataset_davidson = load_dataset_ext('/home/text_mining_project/t-davidson.csv')

In [9]:
from sklearn.model_selection import train_test_split
# Raw texts (features) and their string labels as NumPy arrays
X = dataset_davidson['X'].values
y = dataset_davidson['y'].values

# Hold out 10% of the data as the test set. The split is driven by the
# notebook-wide `seed` instead of a second hard-coded literal, so changing the
# seed in the configuration cell changes the split as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

In [10]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re

# NLTK's German stop-word list as a set. In this cell it is only referenced by
# the commented-out filter inside tokenize(); the TF-IDF cell builds it again.
# NOTE(review): German resources are used although the data file name suggests
# the Davidson tweet corpus - confirm the language of the tweets.
stopwords_german = set(stopwords.words('german'))

# Twitter handle: '@' followed by 1-15 letters, digits or underscores
usernamePattern = re.compile(r'@[A-Za-z0-9_]{1,15}')
# http(s) URL (scheme, then any run of '/', '.', ':', word characters,
# parentheses or digits 1-9) plus at most one trailing whitespace character.
# Raw string: the former '\/' escapes were invalid string escapes.
urlPattern = re.compile(r'(https?://)[/.:\w(1-9)]*\s?')
# HTML-escaped ampersand
andPattern = re.compile(r'&amp;')
# Literal '|LBR|' marker. The pipes must be escaped: unescaped, the pattern is
# an alternation whose empty branch matches everywhere, so nothing was removed.
lbrPattern = re.compile(r'\|LBR\|')
# HTML-escaped '>' and '<'
gtPattern = re.compile(r'&gt;')
ltPattern = re.compile(r'&lt;')
# Hyphen / minus sign
minusPattern = re.compile(r'-')
stemmer = SnowballStemmer("german")
tkz = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def tokenize(text):
    """Clean a raw tweet and split it into tokens.

    The module-level patterns are applied in a fixed order: user names and
    URLs are replaced by the empty string, ``andPattern`` by ``"und"``,
    ``lbrPattern`` by the empty string, the escaped ``>``/``<`` entities by the
    plain characters and ``minusPattern`` by a single space. The cleaned text
    is then split with the module-level ``tkz`` tokenizer.

    Tokens of length one are discarded and a leading ``#`` is removed from the
    remaining tokens. Stop-word filtering and stemming are not applied here.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    substitutions = (
        (usernamePattern, ""),
        (urlPattern, ""),
        (andPattern, "und"),
        (lbrPattern, ""),
        (gtPattern, ">"),
        (ltPattern, "<"),
        (minusPattern, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)

    # The length check is made on the token as produced by the tokenizer,
    # i.e. before a leading '#' is stripped.
    return [
        tok[1:] if tok[0] == '#' else tok
        for tok in tkz.tokenize(text)
        if len(tok) > 1
    ]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# No cap on the vocabulary size
max_features = None
stopwords_german = set(stopwords.words('german'))
# scikit-learn documents `stop_words` as 'english', a list or None, so the set
# is handed over as a (sorted, hence deterministic) list.
tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words=sorted(stopwords_german),
    max_features=max_features,
    ngram_range=(1, 3),
)
print('Transforming documents...')
# The vocabulary and IDF weights are learned on the training texts only and
# then applied to the test texts.
# NOTE: X_train / X_test are overwritten with the vectorised matrices, so this
# cell is not safe to re-run without re-running the train/test split first.
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
print('Transformation finished!')

Transforming documents...
Transformation finished!


In [12]:
from sklearn.preprocessing import LabelEncoder

def encode_label(y):
    """Map string labels to binary integers.

    Parameters
    ----------
    y : iterable
        Labels to encode; any iterable works (list, NumPy array, generator).

    Returns
    -------
    list of int
        ``1`` for every label equal to ``'OFFENSE'`` and ``0`` for anything
        else, in input order.
    """
    return [1 if label == 'OFFENSE' else 0 for label in y]

In [13]:
# Turn the string labels into 0/1 targets.
# NOTE: both names are overwritten, so this cell is not idempotent - running it
# a second time would encode the already numeric labels as all zeros.
y_train = encode_label(y_train)
y_test = encode_label(y_test)




In [9]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import  SGDClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

from sklearn.model_selection import cross_val_score


# Display names and estimators are parallel lists: names[i] labels classifiers[i].
names = ["Linear SVM", "SGDClassifier", "BernoulliNB", "LogisticRegression",
         "KNeighborsClassifier", "AdaBoostClassifier", "Random Forest", "Decision Tree"]

# Every estimator that accepts a random_state gets the notebook-wide seed so
# that repeated runs produce the same scores.
classifiers = [
    LinearSVC(random_state=seed),
    SGDClassifier(random_state=seed),
    BernoulliNB(),
    LogisticRegression(random_state=seed, solver='sag', max_iter=1000),
    KNeighborsClassifier(),
    AdaBoostClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    DecisionTreeClassifier(random_state=seed)
]

In [10]:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

# Cross-validate every candidate on the training data and remember the one
# with the highest mean F1 score.
print('Searching best estimator (F1 score) ...')
print()

best_classifier = None
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=k, n_jobs=jobs, scoring='f1')
    mean_f1, spread = scores.mean(), scores.std() * 2
    print('Mean F1 score %s: %0.3f (+/- %0.3f)' % (name, mean_f1, spread))
    # Strict '<' keeps the earliest candidate when two means are equal.
    if best_classifier is None or best_classifier[1] < mean_f1:
        best_classifier = (name, mean_f1)

best_name, best_f1 = best_classifier
print()
print('Best estimator: %s (mean F1 score %0.3f, %d-fold cross-validation)' % (best_name, best_f1, k))

Searching best estimator (F1 score) ...

Mean F1 score Linear SVM: 0.962 (+/- 0.002)
Mean F1 score SGDClassifier: 0.949 (+/- 0.004)
Mean F1 score BernoulliNB: 0.909 (+/- 0.001)
Mean F1 score LogisticRegression: 0.928 (+/- 0.003)
Mean F1 score KNeighborsClassifier: 0.931 (+/- 0.004)
Mean F1 score AdaBoostClassifier: 0.967 (+/- 0.008)
Mean F1 score Random Forest: 0.940 (+/- 0.006)
Mean F1 score Decision Tree: 0.968 (+/- 0.005)

Best estimator: Decision Tree (mean F1 score 0.968, 7-fold cross-validation)


In [17]:
from sklearn.linear_model import  SGDClassifier
from sklearn.metrics import f1_score, accuracy_score

# Fit on the full training set and evaluate once on the held-out test set.
# The estimator is seeded so the reported numbers are reproducible.
clf = SGDClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

acc = accuracy_score(y_test, predictions)
print("Accuracy-Score predictions: ", acc)

F1-Score predictions:  0.9544600938967136
Accuracy-Score predictions:  0.9217426381605486


In [18]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Fit on the full training set and evaluate once on the held-out test set.
# The estimator is seeded so the reported numbers are reproducible.
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

acc = accuracy_score(y_test, predictions)
print("Accuracy-Score predictions: ", acc)

F1-Score predictions:  0.9648913303080965
Accuracy-Score predictions:  0.9407018959257766


In [19]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier

# Fit on the full training set and evaluate once on the held-out test set.
# The estimator is seeded so the reported numbers are reproducible.
clf = AdaBoostClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

acc = accuracy_score(y_test, predictions)
print("Accuracy-Score predictions: ", acc)

F1-Score predictions:  0.9689807976366321
Accuracy-Score predictions:  0.9491730536506656


## Feature Selection

In [14]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Fit a LinearSVC with an L1 penalty on the current training matrix and hand it,
# already fitted (prefit=True), to SelectFromModel as the selection model.
lsvc = LinearSVC(penalty="l1", dual=False).fit(X_train, y_train)
model = SelectFromModel(lsvc, prefit=True)
# Both matrices are overwritten with their reduced versions, so this cell is
# not safe to re-run on its own.
# NOTE(review): the selector is fitted on all of X_train and the following
# cross-validation reuses the reduced X_train, so the fold scores may be
# optimistic - consider selecting features inside a Pipeline instead.
X_train = model.transform(X_train)
X_test = model.transform(X_test)

In [15]:
# Same model comparison as before, now on the reduced feature matrix:
# score every candidate first, then pick the winner in one step.
print('Searching best estimator (F1 score) ...')
print()

mean_f1_per_model = []
for name, clf in zip(names, classifiers):
    scores = cross_val_score(clf, X_train, y_train, cv=k, n_jobs=jobs, scoring='f1')
    mean_f1_per_model.append((name, scores.mean()))
    print('Mean F1 score %s: %0.3f (+/- %0.3f)' % (name, scores.mean(), scores.std() * 2))

# max() returns the first of several equal maxima, i.e. the earliest candidate.
best_classifier = max(mean_f1_per_model, key=lambda entry: entry[1], default=None)

print()
print('Best estimator: %s (mean F1 score %0.3f, %d-fold cross-validation)' % (best_classifier[0], best_classifier[1], k))

Searching best estimator (F1 score) ...

Mean F1 score Linear SVM: 0.974 (+/- 0.004)
Mean F1 score SGDClassifier: 0.937 (+/- 0.004)
Mean F1 score BernoulliNB: 0.969 (+/- 0.005)
Mean F1 score LogisticRegression: 0.929 (+/- 0.005)
Mean F1 score KNeighborsClassifier: 0.948 (+/- 0.004)
Mean F1 score AdaBoostClassifier: 0.967 (+/- 0.008)
Mean F1 score Random Forest: 0.967 (+/- 0.008)
Mean F1 score Decision Tree: 0.970 (+/- 0.005)

Best estimator: Linear SVM (mean F1 score 0.974, 7-fold cross-validation)


In [16]:
from sklearn.metrics import f1_score

# Fit on the training set and evaluate once on the held-out test set.
# The estimator is seeded so the reported number is reproducible.
clf = SGDClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.9443413729128015


In [17]:
from sklearn.metrics import f1_score

# Fit on the training set and evaluate once on the held-out test set.
# The estimator is seeded so the reported number is reproducible.
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.9636537541846006


In [18]:
from sklearn.metrics import f1_score

# Fit on the training set and evaluate once on the held-out test set.
# The estimator is seeded so the reported number is reproducible.
clf = AdaBoostClassifier(random_state=seed)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# scikit-learn metrics expect (y_true, y_pred) in that order.
f1 = f1_score(y_test, predictions)
print("F1-Score predictions: ", f1)

F1-Score predictions:  0.9689807976366321
