In [1]:
%matplotlib notebook
import warnings
import numpy as np

# Prevents warnings during cross-validation
warnings.filterwarnings("ignore")

# Number of folds during cross-validation
k = 7

# Number of parallel computations (n_jobs parameter); -1 for utilizing the entire cpu
jobs = -1

# Pseudo-random number generator seed, for reproduceable results
seed = 42

In [2]:
def predictions_to_txt(filename, X_test, y_pred):
    i=0
    results = []
    for x in X_test:
        line = []
        line.append(x)
        if y_pred[i] == 1:
            line.append('OFFENSE')
        else:
            line.append('OTHER')
        line.append('OTHER')
        i += 1
        results.append(line)

    predictions = results
    np.savetxt(filename, predictions, fmt=['%s', '%s', '%s'], delimiter='\t', newline='\n', \
                  header='', footer='', encoding='utf-8')

In [3]:
def so_load_data(path):
    with open(path) as f:
        content = f.read().splitlines()
    # you may also want to remove whitespace characters like \n at the end of each line
    return [x.rstrip('\\n') for x in content]

In [4]:
import pandas as pd
import codecs
from tqdm import tqdm

def load_dataset(path):
    dataset = pd.DataFrame(columns=['X', 'y1', 'y2'])
    #print('Loading dataset...')
    with codecs.open(path, "r", encoding='utf-8', errors='ignore') as fdata:
        for line in tqdm(fdata.readlines()):
            line_split = line.split()
            formated = ' '.join(line_split[:-2])
            dataset.loc[-1] = [formated, line_split[-2], line_split[-1]]  # adding a row
            dataset.index = dataset.index + 1  # shifting index
            dataset = dataset.sort_index()  # sorting by index
    return dataset

In [5]:
dataset = load_dataset(path = '/home/text_mining_project/text_mining_project_2018/evaluation/germeval2018.training.txt')

100%|██████████| 5009/5009 [00:14<00:00, 342.76it/s]


In [6]:
X_train = dataset['X'].values
y_train = dataset['y1'].values

X_test = so_load_data(path = '/home/text_mining_project/germeval2018.test.txt')
X_test_raw = X_test

In [7]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import re

stopwords_german = set(stopwords.words('german'))

usernamePattern = re.compile('@[A-Za-z0-9_]{1,15}')
urlPattern = re.compile('(https?:\/\/)[\/.:\w(1-9)]*\s?')
andPattern = re.compile('&amp;')
lbrPattern = re.compile('|LBR|')
gtPattern = re.compile('&gt;')
ltPattern = re.compile('&lt;')
minusPattern = re.compile('-')
stemmer = SnowballStemmer("german")
tkz = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)

def tokenize(text):
    text = re.sub(usernamePattern, "",  text)
    text = re.sub(urlPattern, "",  text)
    text = re.sub(andPattern, "und", text)
    text = re.sub(lbrPattern, "",  text)
    text = re.sub(gtPattern, ">", text)
    text = re.sub(ltPattern, "<", text)
    text = re.sub(minusPattern, ' ', text)
    output = []
    tokens = tkz.tokenize(text)
    for token in tokens:
        #if token not in stopwords_german:
        if len(token) > 1:
            if token[0] == '#':
                token = token[1:]
            # output.append(stemmer.stem(token))
            output.append(token)
    return output

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = None
stopwords_german = set(stopwords.words('german'))
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords_german, max_features=max_features, ngram_range=(1,3))
print('Transforming documents...')
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
print('Transformation finished!')

Transforming documents...
Transformation finished!


In [9]:
def encode_label(y):
    np.ones(len(y))
    choose = lambda l : 1 if l == 'OFFENSE' else 0
    return [choose(l) for l in y]

In [10]:
y_train = encode_label(y_train)
print()




In [11]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

#lsvc = LinearSVC(penalty="l1", dual=False).fit(X_train, y_train)
#model = SelectFromModel(lsvc, prefit=True)
#X_train = model.transform(X_train)
#X_test = model.transform(X_test)

In [12]:
from sklearn.svm import LinearSVC
clf = LinearSVC(C=3, class_weight=None, dual=False, fit_intercept=False,
     intercept_scaling=0.1, loss='squared_hinge', max_iter=10,
     multi_class='ovr', penalty='l1', random_state=42, tol=0.0001,
     verbose=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [13]:
#from sklearn.naive_bayes import BernoulliNB
#clf = BernoulliNB()
#clf.fit(X_train, y_train)
#y_pred = clf.predict(X_test)

In [14]:
predictions_to_txt('upInf_coarse_1.txt', X_test_raw, y_pred)