In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset CSV; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='codeforce_user_performance.csv')

In [3]:
# Count how many rows carry each distinct value of the 'rank-type' column.
class_counts = df.loc[:, 'rank-type'].value_counts()
print(class_counts)

rank-type
Master              628
Candidate Master    411
Name: count, dtype: int64


In [4]:
# Fill missing values in columns contest1 .. contest10 with the column mean.
# The result is cast with astype(int), which drops any fractional part of the
# imputed mean.
# NOTE(review): the imputer is fitted on the full frame, i.e. before any
# train/test split is made — confirm that this is intended.
impute = [f'contest{i}' for i in range(1, 11)]
imputer = SimpleImputer(strategy='mean')
df[impute] = imputer.fit(df[impute]).transform(df[impute]).astype(int)

In [5]:
# Standardise the contest columns to z-scores (zero mean, unit variance).
# Only the columns listed in `impute` are rescaled.
scaler = StandardScaler().fit(df[impute])
df[impute] = scaler.transform(df[impute])

In [6]:
def train_test_split(x, y, test_size, random_state=None):
    """Shuffle ``x`` and ``y`` with one shared permutation and split them in two.

    Parameters
    ----------
    x : array-like
        Samples, indexed along the first axis.
    y : array-like
        One label per sample of ``x``.
    test_size : float
        Fraction of the samples, in [0, 1], that goes to the test part. The
        train part receives ``int(n * (1 - test_size))`` samples and the test
        part receives the remainder.
    random_state : int, optional
        Seed for the shuffle. When given, the split is reproducible and the
        global NumPy random state is not touched. When ``None`` (the default)
        the global ``np.random`` state is used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(x_train, x_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different lengths, or ``test_size`` is outside
        [0, 1].
    """
    x = np.asarray(x)
    y = np.asarray(y)
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length, got {len(x)} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size!r}")

    # A private RandomState keeps a seeded split from disturbing global state;
    # without a seed fall back to the global generator so np.random.seed works.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(len(x))
    rng.shuffle(indices)

    x_shuffled = x[indices]
    y_shuffled = y[indices]
    pos = int(y.shape[0] * (1 - test_size))  # number of training samples
    return x_shuffled[:pos], x_shuffled[pos:], y_shuffled[:pos], y_shuffled[pos:]

In [7]:
# Feature matrix: every column except the target and the user identifier.
X = df.drop(columns=['rank-type', 'userid']).to_numpy()
# Target labels as a NumPy array.
y = df['rank-type'].to_numpy()

# Seed NumPy's global generator so that the shuffle performed inside
# train_test_split, and therefore every number derived from the split, is the
# same on each Restart & Run All.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, 0.2)

In [8]:
class SVM:
    """Binary linear classifier trained by batch subgradient descent on the hinge loss.

    The decision function is ``sign(X @ w)``; the model has no intercept and no
    regularisation term. The two distinct labels found in the training data are
    encoded as -1 and +1 in ``np.unique`` (sorted) order and decoded again by
    :meth:`predict`.

    Parameters
    ----------
    d : float
        Convergence threshold (stored as ``delta``): training stops once an
        update would move the weights by less than ``d`` in Euclidean norm.
    a : float
        Step size (stored as ``alpha``).
    max_iter : int, optional
        Upper bound on the number of weight updates. ``None`` (the default)
        means no bound, so training runs until the convergence test passes.
    """

    def __init__(self, d, a, max_iter=None):
        self.delta = d
        self.alpha = a
        self.max_iter = max_iter

    def gradient(self, x, y):
        """Return the hinge-loss subgradient with respect to ``self.w``.

        Every row whose margin ``y_i * (w . x_i)`` is below 1 contributes
        ``-y_i * x_i``; rows with margin >= 1 contribute nothing.

        Parameters
        ----------
        x : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Labels already encoded as -1 / +1.

        Returns
        -------
        numpy.ndarray, shape (n_features,)
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        violating = y * (x @ self.w) < 1
        # An empty selection yields a zero vector, matching "no violators".
        return -(x[violating].T @ y[violating])

    def l2norm(self, X):
        """Return the Euclidean (L2) norm of ``X`` viewed as a flat vector."""
        return np.linalg.norm(X)

    def fit(self, X, y):
        """Store private copies of the training data and train the model.

        The caller's arrays are never modified.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if
            ``y`` does not contain exactly two distinct labels.
        """
        features = np.array(X, dtype=float)  # np.array copies; X[:] is only a view
        labels = np.array(y).ravel()
        if features.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got ndim={features.ndim}"
            )
        if labels.shape[0] != features.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {features.shape[0]} and {labels.shape[0]}"
            )
        self.X = features
        self.y = labels
        self.train()

    def train(self):
        """Fit ``self.w`` on ``self.X`` / ``self.y`` by subgradient descent.

        ``self.y`` keeps the original labels; the -1 / +1 encoding lives in a
        local array, so calling ``train`` again gives the same label mapping.

        Raises
        ------
        ValueError
            If ``self.y`` does not contain exactly two distinct labels.
        """
        classes = np.unique(self.y)
        if classes.shape[0] != 2:
            raise ValueError(
                f"SVM is a binary classifier; expected 2 classes, got {classes.shape[0]}"
            )
        self.class_value = {-1: classes[0], 1: classes[1]}
        # Encode into a fresh array: rewriting labels in place can collide with
        # label values that are themselves -1 or 1.
        y_signed = np.where(self.y == classes[0], -1.0, 1.0)
        self.w = np.zeros(self.X.shape[1])

        n_iter = 0
        while self.max_iter is None or n_iter < self.max_iter:
            grad = self.gradient(self.X, y_signed)
            w_new = self.w - self.alpha * grad
            # Stop before applying a step smaller than delta.
            if self.l2norm(w_new - self.w) < self.delta:
                break
            self.w = w_new
            n_iter += 1

    def predict(self, X):
        """Return the predicted original label for every row of ``X`` as a list."""
        scores = np.dot(np.asarray(X, dtype=float), self.w)
        # The tiny offset sends a score of exactly zero to the +1 class.
        signs = np.where(scores + 1e-9 >= 0, 1, -1)
        return [self.class_value[s] for s in signs.tolist()]

    def accuracy_score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        Raises
        ------
        ValueError
            If the number of labels differs from the number of predictions.
        """
        y_true = np.asarray(y, dtype=object).ravel()
        y_pred = np.asarray(self.predict(X), dtype=object)
        if y_true.shape[0] != y_pred.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {y_pred.shape[0]} and {y_true.shape[0]}"
            )
        return np.sum(y_pred == y_true) / y_true.shape[0]


In [9]:
# Build the classifier with convergence threshold 1e-3 and step size 1e-5,
# then train it on the training split.
classifier = SVM(d=1e-3, a=1e-5)
classifier.fit(X=X_train, y=y_train)

In [10]:
# Fraction of held-out rows whose predicted label equals the true label.
print('Accuracy:', classifier.accuracy_score(X=X_test, y=y_test))

Accuracy: 0.8461538461538461
