In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('german')

# Data Preprocessing

In [6]:
# Load the Dataset
df = pd.read_csv("dataset.csv")
# remove rows without label
df = df[df.label != 0]
df = df.dropna()

# shift labels by -2 so that negative = -1, neutral = 0 and positive = 1
df['label'] = [x-2 for x in df['label']]

# get the length of all samples
total_samples = len(df['label'])

# split the data into training and testing data. 2/3 for training, 1/3 for testing
trainData , testData = df[:int(total_samples*(2/3))], df[int(total_samples*(2/3)):]



# Create fthe tf-idf vectorizor
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 1.0,
                             sublinear_tf = True,
                             use_idf = True)

# transform the text into the tf-idf vectorized texts.
train_vectors = vectorizer.fit_transform(trainData['text_lower'])
test_vectors = vectorizer.transform(testData['text_lower'])

In [7]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')

# measure time, how long the algorithm take to train
t0 = time.time()

# train the actuall classifier
classifier_linear.fit(train_vectors, trainData['label'])

t1 = time.time()

# predict the labels on the test set
prediction_linear = classifier_linear.predict(test_vectors)

t2 = time.time()

# Calculate the training and testing time
time_linear_train = t1-t0
time_linear_predict = t2-t1
# results
print("Training time: %fs;\n Prediction time: %fs\n" % (time_linear_train, time_linear_predict))
print("_____________________________________________________")
print(classification_report(testData['label'], prediction_linear,digits=4, output_dict=False))
print("_____________________________________________________")



Training time: 3.994404s; Prediction time: 1.665094s

_____________________________________________________
              precision    recall  f1-score   support

        -1.0     0.5182    0.4020    0.4528       495
         0.0     0.5539    0.7455    0.6356       841
         1.0     0.6055    0.4170    0.4939       578

    accuracy                         0.5575      1914
   macro avg     0.5592    0.5215    0.5274      1914
weighted avg     0.5603    0.5575    0.5455      1914

_____________________________________________________


In [8]:
from sklearn.metrics import confusion_matrix

# compute the confusion matrix
cm = confusion_matrix(testData['label'].tolist(), prediction_linear)
print(cm)
pos = cm[0,0] + cm[1,1] + cm[2,2]
neg = cm[0,1] + cm[0,2] + cm[1,0] + cm[1,2] + cm[2,0] + cm[2,1]

[[199 247  49]
 [106 627 108]
 [ 79 258 241]]
Accuracy: 56.00000000000001 %
