In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
import pickle

In [None]:
import sys
# Add the current working directory to the module search path. The next cell
# imports `src.visualization`, which presumably lives under it — TODO confirm.
sys.path.append('./')

In [None]:
from src.visualization import plot_data

In [None]:
!head -5 data/candy-data.csv

In [None]:
# Load the candy dataset, using the `competitorname` column as the row index.
data = pd.read_csv(
    'data/candy-data.csv',
    index_col='competitorname',
)

# Data overview

Shape of the data:

In [None]:
data.shape

Sample records:

In [None]:
data.head()

In [None]:
# Columns that are cast to the pandas `category` dtype further down.
# Grouped only for readability; the resulting order is unchanged.
_ingredient_flags = [
    'chocolate',
    'fruity',
    'caramel',
    'peanutyalmondy',
    'nougat',
    'crispedricewafer',
]
_form_flags = ['hard', 'bar', 'pluribus']
cat_columns = [*_ingredient_flags, *_form_flags]

In [None]:
# Cast every column listed in `cat_columns` to the `category` dtype in a single
# call; all other columns keep their dtype and the column order is preserved.
data = data.astype({name: 'category' for name in cat_columns})

Basic statistics:

In [None]:
# include='all' makes describe() summarise every column regardless of dtype
# (categorical columns included), not just the numeric ones.
data.describe(include='all')

Note that `freq` is actually the count of the most frequent value (`top`), at least for `pandas==0.25.3`.

Counts of non-NA entries:

In [None]:
data.count()

Dependence of percent of wins on percent of sugar:

In [None]:
# Scatter of win percentage against sugar percentile, split by the `chocolate`
# flag. In the dataset `chocolate == 1` means the candy contains chocolate, so
# key 1 is labelled as chocolate candy and key 0 as chocolate-free.
# NOTE(review): assumes plot_data looks each category value up in
# `category_labels` for the legend — confirm against src.visualization.
properties = {
    'x': "sugarpercent",
    'y': "winpercent",
    'x_label': "Percent of sugar",
    'y_label': "Percent of wins",
    'category': "chocolate",
    'category_labels': {
        0: "Chocolate free candy",
        1: "Chocolate candy",
    }
}

plot_data(data, properties)

Dependence of percent of wins on price:

In [None]:
# Scatter of win percentage against price percentile, split by the `chocolate`
# flag. In the dataset `chocolate == 1` means the candy contains chocolate, so
# key 1 is labelled as chocolate candy and key 0 as chocolate-free.
# NOTE(review): assumes plot_data looks each category value up in
# `category_labels` for the legend — confirm against src.visualization.
properties = {
    'x': "pricepercent",
    'y': "winpercent",
    'x_label': "Price",
    'y_label': "Percent of wins",
    'category': "chocolate",
    'category_labels': {
        0: "Chocolate free candy",
        1: "Chocolate candy",
    }
}

plot_data(data, properties)

In [None]:
# Convert every `category` column back to plain integers so the whole frame is
# numeric; column order is preserved.
categorical_names = data.select_dtypes(include=['category']).columns
data = data.astype({name: 'int' for name in categorical_names})

In [None]:
cols_to_normalize = [
    'sugarpercent',
    'pricepercent',
    'winpercent',
]
scaler = StandardScaler() # MinMaxScaler()
data[cols_to_normalize] = scaler.fit_transform(data[cols_to_normalize])

In [None]:
df_train, df_test = train_test_split(
    data,
    test_size=0.2,
    stratify=data["chocolate"]
)

In [None]:
# Build the training tensors. The target is the `chocolate` flag (the column
# the split is stratified on); every other column is a feature.
# Selecting by name instead of by position keeps this correct if the column
# order changes, and leaving `df_train` as a DataFrame keeps the cell re-runnable
# (rebinding it to a tensor made a second run fail on `.values`).
# NOTE(review): assumes `chocolate` is the first column of the frame, so the
# result matches the former `[:, 0]` / `[:, 1:]` slicing — confirm.
target_column = "chocolate"
x_train = torch.tensor(
    df_train.drop(columns=[target_column]).values,
    dtype=torch.float,
)
y_train = torch.tensor(df_train[target_column].values, dtype=torch.float)

In [None]:
# Build the test tensors. The target is the `chocolate` flag (the column the
# split is stratified on); every other column is a feature.
# Selecting by name instead of by position keeps this correct if the column
# order changes, and leaving `df_test` as a DataFrame keeps the cell re-runnable
# (rebinding it to a tensor made a second run fail on `.values`).
# NOTE(review): assumes `chocolate` is the first column of the frame, so the
# result matches the former `[:, 0]` / `[:, 1:]` slicing — confirm.
target_column = "chocolate"
x_test = torch.tensor(
    df_test.drop(columns=[target_column]).values,
    dtype=torch.float,
)
y_test = torch.tensor(df_test[target_column].values, dtype=torch.float)

In [None]:
# Persist the four tensors as one pickled dict keyed by split name.
# The output directory is created first if it does not exist yet.
os.makedirs('./data/', exist_ok=True)
dataset_splits = {
    'x_train': x_train,
    'y_train': y_train,
    'x_test': x_test,
    'y_test': y_test,
}
with open('./data/logistic-regression.pkl', 'wb') as out_file:
    pickle.dump(dataset_splits, out_file)