In [None]:
import pandas as pd
import sys
import numpy as np
import argparse
import random
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from gensim.utils import simple_preprocess

In [None]:
# Load the training CSV into `df`; later cells read its
# 'discourse_type' and 'discourse_text' columns.
train_csv_path = "../input/feedback-prize-2021/train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# Map each discourse type name to an integer class id.
# The id is the position of the name in this list.
labels = {
    name: class_id
    for class_id, name in enumerate([
        'Lead',
        'Position',
        'Claim',
        'Concluding Statement',
        'Evidence',
        'Counterclaim',
        'Rebuttal',
    ])
}

In [None]:
# Encode the 'discourse_type' strings as integer class ids.
# A discourse type missing from `labels` raises KeyError.
labels_int = [labels[discourse_type] for discourse_type in df['discourse_type']]

df['label'] = labels_int

In [None]:
# Tokenize every 'discourse_text' entry with gensim's simple_preprocess
# (deacc=True) and store the token lists in a new 'tokenized_text' column.
tokenized_rows = []
for raw_text in df['discourse_text']:
    tokenized_rows.append(simple_preprocess(raw_text, deacc=True))
df['tokenized_text'] = tokenized_rows
print(df['tokenized_text'].head(10))

In [None]:
# split training and test dataset (80/20, stratified on the class label)
random.seed(42)
# train_test_split draws its randomness from NumPy, not from Python's `random`
# module, so the seed above does not make the split repeatable.
# random_state pins the shuffle so every run yields the same train/test rows.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
# Reset to a 0..N-1 index so later cells can address rows by position.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Count token occurrences per class, using the TRAINING rows only.
# Reading from `train` (not the unsplit `df`) keeps held-out rows out of the
# Naive Bayes statistics and makes the counts match the rows the SVMs are
# fitted on.
counter_all = {}
for label, text in zip(train['label'], train['tokenized_text']):
    if label not in counter_all:
        counter_all[label] = Counter()
    # update the counter with this row's tokens (duplicates are counted)
    counter_all[label].update(text)
# counter_all maps each class label to the summed token counts of that class

In [None]:
# calculate log-count ratio of NB features
ratios = dict()
alpha = 0.8 # smoothing parameter

# Vocabulary: every token counted for any class.
# Sorted so the token -> column mapping is the same on every run; plain set
# iteration order for strings changes between interpreter sessions.
tokens = sorted(set().union(*counter_all.values()))
V = len(tokens) # size of all tokens

# token -> column index in the feature space
dic = {token: index for index, token in enumerate(tokens)}

# Per-class count vector aligned with `tokens` (Counter returns 0 for a token
# the class never produced).
class_counts = {
    label: np.array([counter[token] for token in tokens], dtype=np.float64)
    for label, counter in counter_all.items()
}

# sum tokens counts for all classes with alpha smoothing
# 2* because one alpha gets subtracted when q is computed as sum_counts - p,
# which leaves q with exactly one alpha of smoothing as well
sum_counts = np.full(V, 2*alpha)
for counts in class_counts.values():
    sum_counts += counts

# calculate the ratio of different labels
for label, counts in class_counts.items():
    # p: smoothed counts of this class; q: smoothed counts of all other classes
    p_label = alpha + counts
    q_label = sum_counts - p_label

    # normalize (l1 norm) so both become distributions over the vocabulary
    p_label /= np.linalg.norm(p_label, ord=1)
    q_label /= np.linalg.norm(q_label, ord=1)

    # log-count ratio: log(p/|p|) - log(not_p/|not_p|)
    ratios[label] = np.log(p_label) - np.log(q_label)

In [None]:
# change training and test data into matrix form
def load_data(df,dic,V,ratios):
    """Turn a labelled, tokenized frame into per-class sparse features and targets.

    Rows are read by position, so `df` may carry any index (for example the
    shuffled index returned by train_test_split).

    Parameters
    ----------
    df : DataFrame with a 'label' column (integer class ids) and a
        'tokenized_text' column (one iterable of tokens per row).
    dic : mapping token -> column index; tokens not in `dic` are ignored.
    V : number of columns of the produced matrices.
    ratios : mapping class -> array indexable by column index, giving the
        feature value of each token for that class.

    Returns
    -------
    X : dict class -> csr_matrix of shape (len(df), V), dtype float32. Each
        distinct known token of a row contributes ratios[class][column] once,
        regardless of how many times it occurs in the row.
    Y : dict class -> int64 array, 1 where the row's label equals the class,
        else 0.
    Y_true : int64 array with the label of every row.
    """
    N = len(df)
    classes = list(ratios.keys())

    # Pull the columns out once, positionally; avoids index-label lookups
    # (which break on a non 0..N-1 index) and per-row Series access.
    Y_true = np.array(df['label'].to_numpy(), dtype=np.int64)
    token_lists = df['tokenized_text'].tolist()

    # One binary target vector per class (one-vs-rest)
    Y = {c: (Y_true == c).astype(np.int64) for c in classes}

    # The CSR sparsity structure is shared by all classes; only the stored
    # values differ.
    indptr = [0]
    indices = []
    for row_tokens in token_lists:
        # Counter keys = distinct tokens of the row, in first-seen order
        for token in Counter(row_tokens):
            if token in dic:
                indices.append(dic[token])
        indptr.append(len(indices))

    column_indices = np.asarray(indices, dtype=np.int64)

    # One X (sample) matrix per class: same structure, class-specific values
    X = dict()
    for c in classes:
        values = np.asarray(ratios[c])[column_indices]
        X[c] = csr_matrix((values, column_indices, indptr), shape=(N, V), dtype=np.float32)

    return X,Y,Y_true

In [None]:
# Vectorize both splits with the same vocabulary (`dic`, `V`) and the same
# per-class ratios, so train and test share one feature space.
X_train, Y_train, Y_train_true = load_data(df=train, dic=dic, V=V, ratios=ratios)
X_test, Y_test, Y_test_true = load_data(df=test, dic=dic, V=V, ratios=ratios)

In [None]:
# SVM fit: one binary (one-vs-rest) LinearSVC per class, each trained on the
# feature matrix built from that class's ratios
svms = dict()
for label in ratios.keys():
    # random_state pins the solver's internal shuffling so repeated runs
    # give the same model and the same reported accuracy
    svms[label] = LinearSVC(max_iter = 5000, random_state = 42)
    svms[label].fit(X_train[label], Y_train[label])

# SVM prediction: signed distance to each class's separating hyperplane
preds = dict()
for label in ratios.keys():
    preds[label] = svms[label].decision_function(X_test[label])

# turn the per-class scores into a predicted label: the class with the highest
# score wins; on ties argmax keeps the first class in `ratios` order
class_order = list(ratios.keys())
score_matrix = np.column_stack([preds[label] for label in class_order])
pred = np.array(class_order)[score_matrix.argmax(axis=1)]

# accuracy
acc_svm = accuracy_score(Y_test_true, pred)
print('NBSVM: %f' % (acc_svm,))