In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn import preprocessing
from sklearn import feature_selection
from sklearn import svm, semi_supervised

In [None]:
def load_data(train_only=True):
    """Load the training shards and, optionally, the validation file.

    The five tab-separated files ``train_1.txt`` .. ``train_5.txt`` under
    ``open_data_train_valid/train/`` are read and stacked row-wise into one
    DataFrame.

    Args:
        train_only: when True (the default) only the training frame is
            returned; when False the validation frame is read as well.

    Returns:
        ``data_train`` when ``train_only`` is True, otherwise the tuple
        ``(data_train, data_val)``.
    """
    train_parts = [
        pd.read_csv('open_data_train_valid/train/train_{}.txt'.format(i), sep='\t')
        for i in range(1, 6)
    ]
    # Every shard gets its own 0..n-1 index from read_csv; without ignore_index
    # the stacked frame would carry repeated row labels.
    data_train = pd.concat(train_parts, ignore_index=True)
    if train_only:
        return data_train
    # NOTE(review): unlike the training shards this is read with the default
    # ',' separator and the path has no file extension - confirm both.
    data_val = pd.read_csv('open_data_train_valid/val')
    return data_train, data_val


def preprocess(data_train, data_val=None, n_features=6745):
    """Impute and standardise the feature columns in place, then mark unknown labels.

    For every column ``f1`` .. ``f<n_features>``:
      * missing values are replaced by the training median of that column;
      * the column is standardised with a ``StandardScaler`` fitted on the
        training data only.
    When ``data_val`` is given, the same training median and the same fitted
    scaler are applied to it, so no validation statistics leak into the fit.

    Finally, every training row with ``tag == 1`` gets ``label = -1``.

    Args:
        data_train: training DataFrame with the feature columns plus ``tag``
            and ``label``; modified in place.
        data_val: optional validation DataFrame with the same feature
            columns; modified in place.
        n_features: number of ``f<i>`` columns to process (default 6745).

    Returns:
        None. Both frames are updated in place.
    """
    for i in range(1, n_features + 1):
        feat = 'f' + str(i)
        # Series.median() skips NaN, so this is the median of the observed values.
        train_median = data_train[feat].median()
        data_train[feat] = data_train[feat].fillna(train_median)
        # normalize
        scaler = preprocessing.StandardScaler()
        data_train[feat] = scaler.fit_transform(
            data_train[feat].values.reshape(-1, 1)).ravel()
        if data_val is not None:
            # Impute with the *training* median so validation rows contain no NaN
            # when they are later handed to a model.
            data_val[feat] = data_val[feat].fillna(train_median)
            data_val[feat] = scaler.transform(
                data_val[feat].values.reshape(-1, 1)).ravel()
    # set unknown labels: done once, after the loop, and addressed by the column
    # *name* 'label' (indexing with the label values selects the wrong columns).
    data_train.loc[data_train['tag'] == 1, 'label'] = -1


def feature_selection(data, labels, k=100):
    """Pick the ``k`` features with the highest mutual information with ``labels``.

    Args:
        data: 2-D feature matrix, one row per sample.
        labels: class label for each row of ``data``.
        k: number of features to keep (default 100).

    Returns:
        Boolean mask over the columns of ``data``; True marks a kept feature.
    """
    # This function's name rebinds the module-level `feature_selection` imported
    # from sklearn, so inside the body that name refers to the function itself.
    # Import the sklearn module locally under a distinct alias instead.
    from sklearn import feature_selection as sk_feature_selection

    fselector = sk_feature_selection.SelectKBest(
        score_func=sk_feature_selection.mutual_info_classif, k=k)
    fselector.fit(data, labels)
    return fselector.get_support()
    
    
def label_propagation(data, labels):
    """Fit an RBF-kernel ``LabelPropagation`` model and predict a label for every row.

    Args:
        data: feature matrix the model is fitted on and then asked to predict.
        labels: label for each row of ``data``, passed to ``fit`` unchanged.

    Returns:
        The labels predicted by the fitted model for ``data``.
    """
    propagator = semi_supervised.LabelPropagation(kernel='rbf', gamma=20)
    # fit() hands back the estimator itself, so the prediction can be chained.
    return propagator.fit(data, labels).predict(data)


def train_svm_classifier(data, labels):
    """Fit a ``LinearSVC`` on ``data``/``labels`` and report its accuracy on that same data.

    Args:
        data: feature matrix used for fitting.
        labels: target label for each row of ``data``.

    Returns:
        The fitted classifier. The training accuracy is printed, not returned.
    """
    model = svm.LinearSVC()
    model.fit(data, labels)
    # Scored on the data it was fitted on, so this is a training accuracy only.
    print('training acc: {}'.format(model.score(data, labels)))
    return model

In [None]:
# load_data() defaults to train_only=True and then returns a single DataFrame,
# which cannot be unpacked into (train, val); request both splits explicitly.
data_train, data_val = load_data(train_only=False)
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
data_train.info()

In [None]:
# Peek at the first rows instead of rendering the whole frame.
data_train.head()

In [None]:
# Pass the validation frame too: it is fed to the classifier in the prediction
# cell, so it has to be scaled with the scalers fitted on the training data.
preprocess(data_train, data_val)

In [None]:
# Peek at the first rows of the preprocessed frame instead of rendering all of it.
data_train.head()

In [None]:
# feature selection
feat_names = ['f' + str(i) for i in range(1, 6746)]  # columns f1 .. f6745
feats = data_train[feat_names].values
labels = data_train['label'].values
# Score features on the labelled rows only: -1 marks an unknown label, not a real
# class, and including it would reward features that merely predict whether a row
# happens to be labelled.
is_labelled = labels != -1
mask = feature_selection(feats[is_labelled], labels[is_labelled])
# The mask is applied to every row, labelled or not, for the propagation step.
selected_feats = feats[:, mask]

In [None]:
# run label propagation over the selected feature columns
new_labels = label_propagation(data=selected_feats, labels=labels)

In [None]:
# fit the SVM on the selected features using the propagated labels
classifier = train_svm_classifier(data=selected_feats, labels=new_labels)

In [None]:
# predict
# The classifier was fitted on the masked feature columns only, so the validation
# frame must be reduced to the same columns, in the same order, before predicting
# (passing data_val whole would also feed in 'id' and every unselected feature).
val_feats = data_val[feat_names].values[:, mask]
labels_val = classifier.predict(val_feats)
# NOTE(review): the 'prob' column holds hard class predictions, not probabilities -
# confirm this is what the expected result format wants.
pred_df = pd.DataFrame({'id': data_val['id'], 'prob': labels_val})
pred_df.to_csv('result.txt', index=False)