In [1]:
import os
# Step up one directory so the `utils` package imported in the next cell can be
# found from the working directory.
# NOTE(review): assumes `utils` is a directory in the parent of the notebook
# folder -- confirm. The guard keeps the cell idempotent: without it, every
# re-run of this cell would climb one more level up the tree.
if not os.path.isdir("utils"):
    os.chdir("..")
import pandas as pd
import numpy as np
from scipy.stats import truncnorm

In [2]:
from utils.data import Dataset
from utils.completer import complete_by_mean_col
from utils.generator import gen_complete_random

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [4]:
def create_data(size=1000, corr=0.5, seed=None):
    """Generate a synthetic binary-classification dataset.

    The feature frame has five columns:

    * column 0 -- random 0/1 values drawn independently of the target;
    * column 1 -- a blend of column 0 and a second random 0/1 vector, each
      divided by its Euclidean norm and weighted by ``corr`` and
      ``sqrt(1 - corr**2)`` respectively;
    * columns 2-4 -- truncated-normal noise (mean 2.0, sd 0.5, bounded to
      [0.1, 10.0]) plus the target multiplied by one standard-normal
      coefficient that is drawn once per column.

    Parameters
    ----------
    size : int, default 1000
        Number of rows to generate.
    corr : float, default 0.5
        Weight of column 0 inside column 1. Expected to lie in [-1, 1];
        outside that range ``(1 - corr**2) ** 0.5`` is no longer real.
    seed : int or None, default None
        When ``None`` every draw comes from NumPy's global random state, so
        ``np.random.seed`` controls the result exactly as before. When given,
        a private ``RandomState(seed)`` is used and the global state is left
        untouched, making the call reproducible on its own.

    Returns
    -------
    X : pandas.DataFrame
        Frame of shape ``(size, 5)`` with integer column labels 0..4.
    Y : numpy.ndarray
        ``float32`` vector of length ``size`` holding 0.0 / 1.0 labels.

    Notes
    -----
    If one of the random 0/1 vectors happens to be all zeros (only plausible
    for very small ``size``) its norm is zero and column 1 is not finite.
    """
    # `np.random` exposes the same randint/randn API as a RandomState instance.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # scipy falls back to the global state when random_state is None.
    scipy_state = None if seed is None else rng

    Y = rng.randint(0, 2, (size,)).astype(np.float32)

    col0 = rng.randint(0, 2, (size,)).astype(np.float32)

    vec1 = rng.randint(0, 2, (size,)).astype(np.float32)
    col1 = corr*(col0/np.linalg.norm(col0)) + (1-corr**2)**0.5*(vec1/np.linalg.norm(vec1))

    lower, upper = 0.1, 10.0
    mean = 2.0
    sd = 0.5
    # truncnorm takes its bounds in standard-deviation units relative to loc.
    # The frozen distribution is identical for all three columns, so build it once.
    noise = truncnorm((lower - mean) / sd, (upper - mean) / sd, loc=mean, scale=sd)

    def target_driven_column():
        # One scalar coefficient per column decides how strongly (and in which
        # direction) the column depends on the target. The coefficient is drawn
        # before the noise, matching the original draw order.
        return rng.randn() * Y + noise.rvs(size, random_state=scipy_state)

    col2 = target_driven_column()
    col3 = target_driven_column()
    col4 = target_driven_column()

    X = pd.DataFrame(np.column_stack((col0, col1, col2, col3, col4)))

    return X, Y

In [5]:
# create_data() draws from NumPy's global random state, so seed it first to get
# the same dataset after every Restart & Run All.
np.random.seed(0)
X_data, Y_data = create_data()
# Pairwise correlations between the generated feature columns.
X_data.corr()

Unnamed: 0,0,1,2,3,4
0,1.0,0.496746,0.038494,0.001687,-0.02179
1,0.496746,1.0,0.047532,0.022929,0.006904
2,0.038494,0.047532,1.0,-0.27985,-0.059029
3,0.001687,0.022929,-0.27985,1.0,0.060634
4,-0.02179,0.006904,-0.059029,0.060634,1.0


In [6]:
# Append the labels as an extra column so the correlation matrix also shows
# how each feature relates to the target.
target_frame = pd.DataFrame(Y_data, columns=["_TARGET_"])
features_with_target = pd.concat([X_data, target_frame], axis=1)
features_with_target.corr()

Unnamed: 0,0,1,2,3,4,_TARGET_
0,1.0,0.496746,0.038494,0.001687,-0.02179,0.015265
1,0.496746,1.0,0.047532,0.022929,0.006904,0.018243
2,0.038494,0.047532,1.0,-0.27985,-0.059029,0.662349
3,0.001687,0.022929,-0.27985,1.0,0.060634,-0.433767
4,-0.02179,0.006904,-0.059029,0.060634,1.0,-0.021018
_TARGET_,0.015265,0.018243,0.662349,-0.433767,-0.021018,1.0


In [7]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# split repeatable for a given dataset instead of changing on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data, Y_data, test_size=0.3, random_state=0
)

In [8]:
# Logistic-regression baseline: fit on the training split (fit returns the
# estimator itself), then report accuracy on the held-out rows.
clf = LogisticRegression().fit(X_train, Y_train)
clf.score(X_test, Y_test)

0.8566666666666667

In [9]:
# k-nearest-neighbours with default settings: fit on the training split, then
# report accuracy on the held-out rows.
clf = KNeighborsClassifier().fit(X_train, Y_train)
clf.score(X_test, Y_test)

0.82

In [10]:
# Linear SVM with the iteration cap set to 10000: fit on the training split,
# then report accuracy on the held-out rows.
clf = LinearSVC(max_iter=10000).fit(X_train, Y_train)
clf.score(X_test, Y_test)

0.8566666666666667

In [11]:
# Random forests are randomised (bootstrap samples and feature sub-sampling),
# so pin random_state to make the fitted model and its score repeatable for a
# given train/test split.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

0.81