# ML Workshop 1: General ML
# Data Analysis

In [None]:
%load_ext autoreload
%autoreload 2
%pylab inline

from collections import defaultdict, OrderedDict, Counter
import copy
import numpy as np
import pandas as pd
from pprint import pprint
import matplotlib.pyplot as plt
from datetime import date, datetime
import os

# Load and print some data

In [None]:
# Read the raw dataset from a CSV file (path is relative to the working directory).
df = pd.read_csv("data/data.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# Check data types

In [None]:
# Per-column dtype, as inferred by pandas while parsing the CSV.
df.dtypes

All types are as expected. Some columns mix int and float, but that does not matter much.

In [None]:
# Split the columns by kind so that later cells can treat them differently.
# select_dtypes(include=np.number) picks up every numeric dtype (any int/float
# width), not only int64/float64; boolean columns are not selected.
numerical = df.select_dtypes(include=np.number).columns.tolist()
# Every column that is not numeric is handled as categorical.
categorical = [col for col in df.columns if col not in numerical]

print("numerical", numerical)
print("categorical", categorical)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

- Missing data in columns "Engine HP", "Engine Cylinders", "Number of Doors"
- max(Year) = 2017, but the dataset was published in Dec 2016 => some entries are for a future year
- std > 0 for all columns => no constant columns
- min(Engine Cylinders) = 0 => weird

MPG is not an intuitive unit for me, so convert it to l/100 km.

In [None]:
def mpg_to_l100km(mpg):
    """Convert fuel economy in miles per gallon to litres per 100 km.

    The two units are inversely proportional, so the conversion is a single
    division of the factor 235.21 by ``mpg``. Division by zero is not guarded.
    """
    litres_per_100km = 235.21 / mpg
    return litres_per_100km

# Extremes over both fuel-economy columns. Series.min()/.max() skip NaN,
# which the builtin min()/max() applied to a Series do not.
min_mpg = min(df["city mpg"].min(), df["highway MPG"].min())
# The overall maximum is the LARGER of the two column maxima (taking the
# smaller one would hide the highest value in the data).
max_mpg = max(df["city mpg"].max(), df["highway MPG"].max())

# Show both extremes in the original unit and converted to l/100km.
print("Min {}mpg, {} l/100km".format(min_mpg, mpg_to_l100km(min_mpg)))
print("Max {}mpg, {} l/100km".format(max_mpg, mpg_to_l100km(max_mpg)))

0.66 l/100 km strange but might be OK

# Histograms

In [None]:
# Draw one histogram per numerical column; selecting with a list keeps a
# one-column DataFrame, so each plot gets its own figure and legend.
for column_name in numerical:
    single_column = df[[column_name]]
    hist_axes = single_column.plot.hist(title=column_name + " histogram")
    hist_axes.set_xlabel("Value")

- year, Engine HP, MPG, MSRP - Not very nice distribution => data transformation might help

In [None]:
# Categorical columns with more distinct values than this are printed as a
# table instead of being drawn as a bar chart.
MAX_VALUES_TO_PLOT = 20

for col in categorical:
    # Relative frequency of every distinct value in the column.
    value_counts = df[col].value_counts(normalize=True)
    if len(value_counts) <= MAX_VALUES_TO_PLOT:
        # Fresh figure per column so the bar charts do not share one axes.
        plt.figure()
        ax = value_counts.plot(kind='bar')
        ax.set_xlabel("Value")
        ax.set_ylabel("Frequency")
        ax.set_title(col + " histogram")
    else:
        # Too many distinct values for a readable chart: print the frequencies.
        print("=== {} ===".format(col))
        print(value_counts)
        print()

- "Market category" - more values, we should split
- "Engine Fuel type" - diesel => not a lot 
- "Driven_wheels" - all_wheel_drive and four_wheel_drive difference??? Maybe we can join to one category

# Correlations

In [None]:
# Pairwise (Pearson, the pandas default) correlation of the numerical columns.
# The frame is restricted explicitly: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and fails on the string columns.
df_corr = df[numerical].corr()
df_corr

The last column tells me the potential of each signal for prediction.

### Get the most correlating pairs

In [None]:
# Minimum absolute correlation for a pair of columns to be reported.
CORR_THRESHOLD = 0.5

# Walk the correlation matrix and keep only the entries where the second
# column comes before the first one: this skips the diagonal (a column with
# itself) and the mirrored duplicate of every pair.
most_correlated_pairs = [
    (name_a, name_b, corr_value)
    for pos_a, (name_a, corr_column) in enumerate(df_corr.items())
    for pos_b, (name_b, corr_value) in enumerate(corr_column.items())
    if pos_b < pos_a and abs(corr_value) > CORR_THRESHOLD
]

# Strongest relationships (positive or negative) first.
most_correlated_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
most_correlated_pairs

- MPG correlation - not a surprise
- Engine HP/cylinders - not a surprise
- MSRP, Engine HP/Cylinders => Engine will probably be the best predictor

In [None]:
# One scatter plot per strongly correlated pair; the low alpha makes dense
# regions appear darker than isolated points.
for x_col, y_col, _corr in most_correlated_pairs:
    df.plot.scatter(x=x_col, y=y_col, alpha=0.1)

- anomaly in Highway MPG => explore and correct/get rid of it
- most graphs are not very linear, so a data transformation might be a good idea
- over 1.5M: too few examples => if it makes sense for the task, get rid of them

# Clean data

### Highway MPG

In [None]:
# Inspect the rows whose highway MPG is implausibly high (> 300).
df[df["highway MPG"] > 300]

In [None]:
# Overwrite the highway MPG of the row labelled 1119 with a corrected value.
# NOTE(review): the label 1119 is hard-coded — presumably the outlier row
# selected by the `> 300` filter; confirm it still matches if the CSV changes.
df.loc[1119,"highway MPG"] = 35  # constant instead of `/= 10`, so re-running the cell cannot shrink the value again
# Display the whole corrected row to verify the change.
df.loc[1119, :]

### MSRP

In [None]:
# Shape before filtering, to compare with the shape printed after the filter.
print(df.shape)
# Rows with an MSRP above 1,000,000.
df[df["MSRP"] > 1000000]

In [None]:
# Keep only the rows with an MSRP of at most 1,000,000.
msrp_within_limit = df["MSRP"] <= 1000000
df = df.loc[msrp_within_limit]
print(df.shape)
# Sanity check: this selection should be empty after the filter.
df[df["MSRP"] > 1000000]

# Split to train/dev/test set

In [None]:
from sklearn.model_selection import train_test_split

# Target fractions of the whole data set: (train, dev, test).
splits = (0.6, 0.2, 0.2)
datasets = {}

# Step 1: hold out the test set from the full frame.
train_dev, test_part = train_test_split(df, test_size=splits[2], random_state=1)
datasets["test"] = test_part

# Step 2: split the remainder into train and dev. The dev fraction is rescaled
# so that dev ends up as 20% of the total, not 20% of the remaining 80%.
test_size = splits[1] / sum(splits[:2])
train_part, dev_part = train_test_split(train_dev, test_size=test_size, random_state=1)
datasets["train"] = train_part
datasets["dev"] = dev_part

# Report the absolute size of every split and its share of the cleaned data.
for name, dataset in datasets.items():
    n_rows = len(dataset)
    print("{} len={}, {}".format(name, n_rows, n_rows/len(df)))

In [None]:
# Persist every split to its own CSV file, named after the split.
for split_name, split_frame in datasets.items():
    output_path = "data/data_clean_{}.csv".format(split_name)
    split_frame.to_csv(output_path)