In [None]:
import pandas as pd
from sklearn import svm
import sklearn.preprocessing as skp
import sklearn.model_selection as ms
import sklearn.metrics as sx
from scipy.spatial.distance import euclidean
import numpy as np

In [None]:
# Read the headerless SVM data set, then label the rows 1..N so that row
# labels can be reported as 1-based object numbers.
df = pd.read_csv("svm-data.csv", header=None)
one_based_index = pd.RangeIndex(start=1, stop=len(df) + 1)
df = df.set_index(one_based_index)

In [None]:
# The first column is the class label, every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Standardise the features before fitting.
scaler = skp.StandardScaler()
Xs = scaler.fit_transform(X)

# Linear SVM with a very large C.
cl = svm.SVC(kernel='linear', C=10000000, random_state=241)
cl.fit(Xs, y)

# cl.support_ holds positional indices of the support vectors; map them to the
# 1-based row labels of df and report them in ascending order.
support_labels = sorted(df.index[cl.support_])
s1 = ",".join(map(str, support_labels))

In [None]:
# Save the comma-separated support-vector numbers as the answer file.
with open('w2_0.txt', mode='w') as answer_file:
    answer_file.write(s1)

## Task 2

In [None]:
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the 20 Newsgroups corpus (train and test parts together), keeping only
# the two categories used in this task.
newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space'],
)

In [None]:
# Turn the raw documents into a sparse TF-IDF matrix.
# The previous call passed an undefined name (`stop`) as the first positional
# argument, which raised NameError on a fresh kernel. The vectoriser is now
# built with its default settings.
# NOTE(review): if stop-word filtering was intended, pass stop_words='english'
# explicitly instead - confirm against the task statement.
vectoriser = TfidfVectorizer()
X = vectoriser.fit_transform(newsgroups.data)
y = newsgroups.target

In [None]:
# Candidate regularisation strengths: 10^-5, 10^-4, ..., 10^5.
c_values = np.power(10.0, np.arange(-5, 6))
grid = {'C': c_values}

# Shuffled 5-fold split with a fixed seed so the search is reproducible.
cv = ms.KFold(n_splits=5, shuffle=True, random_state=241)

# Pick C for a linear SVM by cross-validated accuracy.
clf = svm.SVC(kernel='linear')
gs = ms.GridSearchCV(
    estimator=clf,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=1,
)
gs.fit(X, y)

In [None]:
# Refit a linear SVM on the full data set using the C selected by the grid
# search above, instead of a hard-coded value that can silently drift away
# from the search result.
best_C = gs.best_params_['C']
clf = svm.SVC(kernel='linear', random_state=241, C=best_C).fit(X, y)

In [None]:
# Rank features by the absolute value of their weight in the fitted linear SVM
# and keep the ten heaviest ones.
# clf.coef_ is converted with .toarray() and flattened to a 1-D weight vector.
l = list(enumerate(np.ravel(clf.coef_.toarray())))
l_sorted = sorted(l, key=lambda pair: abs(pair[1]), reverse=True)
top_indices = [idx for idx, _ in l_sorted[:10]]

# get_feature_names() was removed from scikit-learn in 1.2; prefer the
# replacement and fall back only on older installations.
if hasattr(vectoriser, "get_feature_names_out"):
    feature_names = vectoriser.get_feature_names_out()
else:
    feature_names = vectoriser.get_feature_names()
result = np.array(feature_names)[top_indices]

In [None]:
# Alphabetically ordered, comma-separated list of the selected words.
s = ",".join(map(str, sorted(result)))

In [None]:
# Save the word list as the answer file.
with open('w2_1.txt', mode='w') as answer_file:
    answer_file.write(s)

## Task 3

In [None]:
# Load the headerless logistic-regression data set (replaces the earlier df).
df = pd.read_csv(
    "data-logistic.csv",
    header=None,
)

In [None]:
# Keep the labels in their original -1/+1 encoding.
# grad1/grad2 multiply every sample's contribution by y and use exp(-y * margin).
# With labels recoded to 0/1, all negative-class samples would contribute a
# zero gradient and the fit would be wrong. roc_auc_score accepts -1/+1 labels
# directly, so no recoding is needed for scoring either.
y = df.loc[:, df.columns[0]].values
# The remaining columns are the features.
X = df.loc[:, df.columns[1:]].values

In [None]:
def grad1(w, k, C):
    """Return the update increment for the first weight, ``w[0]``.

    Reads the module-level feature matrix ``X`` (columns 0 and 1) and the
    label vector ``y``.

    Parameters:
        w: current weights; only ``w[0]`` and ``w[1]`` are read.
        k: multiplier applied to both the data term and the penalty term.
        C: coefficient of the penalty term ``k*C*w[0]``.

    Returns:
        ``k * mean(y * x0 * (1 - 1/(1 + exp(-y * (w·x))))) - k*C*w[0]``.

    NOTE(review): this form presumably expects labels in {-1, +1}; confirm
    against the cell that builds ``y``.
    """
    margin = w[0]*X[:,0] + w[1]*X[:,1]
    residual = 1 - np.power(1 + np.exp(-y*margin), -1)
    data_term = k*np.mean(y*X[:,0]*residual)
    penalty = k*C*w[0]
    return data_term - penalty

def grad2(w, k, C):
    """Return the update increment for the second weight, ``w[1]``.

    Reads the module-level feature matrix ``X`` (columns 0 and 1) and the
    label vector ``y``.

    Parameters:
        w: current weights; only ``w[0]`` and ``w[1]`` are read.
        k: multiplier applied to both the data term and the penalty term.
        C: coefficient of the penalty term ``k*C*w[1]``.

    Returns:
        ``k * mean(y * x1 * (1 - 1/(1 + exp(-y * (w·x))))) - k*C*w[1]``.

    NOTE(review): this form presumably expects labels in {-1, +1}; confirm
    against the cell that builds ``y``.
    """
    margin = w[0]*X[:,0] + w[1]*X[:,1]
    residual = 1 - np.power(1 + np.exp(-y*margin), -1)
    data_term = k*np.mean(y*X[:,1]*residual)
    penalty = k*C*w[1]
    return data_term - penalty

In [None]:
# Iteratively fit the two weights using the increments from grad1/grad2.
max_iter = 10000          # upper bound on the number of weight updates
w0 = np.array([0,0])      # starting point
i = 0                     # number of updates performed so far
eps = 1                   # distance moved by the last update; starts above the tolerance
w = np.array(w0)
C= 0.                     # penalty coefficient passed to grad1/grad2
k = 0.1                   # step multiplier passed to grad1/grad2

# Stop after max_iter updates or once an update moves the weights by less than
# 1e-5 (Euclidean distance). `i < max_iter` caps the loop at exactly max_iter
# updates; the previous `<=` allowed one extra.
while (i < max_iter) and (eps >= 1e-5):
    # Both increments are computed from the same (old) w before it is replaced.
    z = np.array([w[0] + grad1(w, k=k, C=C),  w[1] + grad2(w, k=k, C=C)])
    eps = euclidean(z, w)
    w = z
    i += 1

In [None]:
def expit(x, w):
    """Return the logistic sigmoid of the dot product of ``x`` and ``w``.

    Parameters:
        x: feature vector (anything ``np.dot`` accepts together with ``w``).
        w: weight vector.

    Returns:
        ``1 / (1 + exp(-np.dot(x, w)))``.
    """
    linear_score = np.dot(x, w)
    return np.power(1 + np.exp(-linear_score), -1)

In [None]:
# Score every row of X with the fitted weights; apply_along_axis forwards the
# extra positional argument w to expit, so each call is expit(row, w).
y_hat = np.apply_along_axis(expit, 1, X, w)

In [None]:
# Area under the ROC curve of the predicted scores against the true labels.
result = sx.roc_auc_score(y_true=y, y_score=y_hat)

In [None]:
# Save the AUC, formatted with three decimals, as the answer file.
with open("w3_41.txt", mode='w') as answer_file:
    answer_file.write("%.3f" % result)

## Task 4

In [None]:
# Table of classifier scores; a 'true' column is read from it further below.
scores = pd.read_csv(
    "scores.csv",
)

In [None]:
# Table whose 'true' and 'pred' columns are used further below.
classification = pd.read_csv(
    "classification.csv",
)

In [None]:
# Preview the first rows of the classification table (cell output only).
classification.head()

In [None]:
# Confusion matrix of the predicted labels against the true labels.
CM = sx.confusion_matrix(
    y_true=classification['true'],
    y_pred=classification['pred'],
)

In [None]:
# Name the four cells of the matrix: rows index the true class, columns the
# predicted class, so row 0 is (TN, FP) and row 1 is (FN, TP).
TN, FP = CM[0, 0], CM[0, 1]
FN, TP = CM[1, 0], CM[1, 1]

In [None]:
# Save the four counts in the order TP, FP, FN, TN.
with open("w4_1.txt", mode='w') as answer_file:
    answer_file.write("%d %d %d %d" % (TP, FP, FN, TN))

In [None]:
# Standard quality metrics of the predicted labels against the true labels.
true_labels = classification['true']
pred_labels = classification['pred']

acc = sx.accuracy_score(true_labels, pred_labels)
prec = sx.precision_score(true_labels, pred_labels)
recall = sx.recall_score(true_labels, pred_labels)
f1 = sx.f1_score(true_labels, pred_labels)

In [None]:
# Save accuracy, precision, recall and F1, each with two decimals.
with open("w4_2.txt", mode='w') as answer_file:
    answer_file.write("%.2f %.2f %.2f %.2f" % (acc, prec, recall, f1))

In [None]:
def scores_rauc(col):
    """Return the ROC AUC of one score column against the true labels.

    Parameters:
        col: name of a column in the module-level ``scores`` table.

    Returns:
        ``roc_auc_score`` of ``scores[col]`` measured against ``scores['true']``.
    """
    true_labels = scores['true']
    column_scores = scores[col]
    return sx.roc_auc_score(true_labels, column_scores)

In [None]:
def scores_pr(col, min_recall=0.7):
    """Return the best precision among PR-curve points whose recall exceeds a threshold.

    Parameters:
        col: name of a column in the module-level ``scores`` table.
        min_recall: recall threshold; only curve points with recall strictly
            greater than this value are considered. Defaults to 0.7, the
            value that used to be hard-coded.

    Returns:
        The maximum precision over the selected points of the
        precision-recall curve of ``scores[col]`` against ``scores['true']``.

    NOTE(review): the comparison is strict (``>``). If the requirement is
    "recall of at least min_recall", it should be ``>=`` - confirm.
    """
    # The thresholds returned by precision_recall_curve are not needed here.
    precision, recall, _ = sx.precision_recall_curve(scores['true'], scores[col])
    return np.max(precision[recall > min_recall])

In [None]:
# (column name, ROC AUC) for every column of scores except the label column.
raucs = []
for column_name in scores.columns:
    if column_name != 'true':
        raucs.append((column_name, scores_rauc(column_name)))

In [None]:
# (column name, scores_pr value) for every column of scores except the label column.
prs = []
for column_name in scores.columns:
    if column_name != 'true':
        prs.append((column_name, scores_pr(column_name)))

In [None]:
# Display the (column name, scores_pr value) pairs as the cell output.
prs

In [None]:
# Display the name of the column with the highest ROC AUC (cell output only).
max(raucs, key=lambda x: x[1])[0]

In [None]:
# Save the name of the column with the highest ROC AUC.
with open("w4_3.txt", mode='w') as answer_file:
    best_by_auc = max(raucs, key=lambda pair: pair[1])[0]
    answer_file.write("%s" % (best_by_auc))

In [None]:
# Save the name of the column with the highest scores_pr value.
with open("w4_4.txt", mode='w') as answer_file:
    best_by_precision = max(prs, key=lambda pair: pair[1])[0]
    answer_file.write("%s" % (best_by_precision))