# ML CUP - Data Exploration
In this notebook we perform a simple preliminary visualization of the data.
We want to understand the range and the distribution of the dataset we are dealing with, in order to make better choices in the following analysis.

In [None]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
from itertools import product
import matplotlib.pyplot as plt

In [None]:
# Font sizes shared by every figure drawn in this notebook.
SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
# The axes title size was previously set twice (SMALL_SIZE, then MEDIUM_SIZE);
# only the last call took effect, so the effective value is now set once,
# together with the x/y label size.
plt.rc('axes', titlesize=MEDIUM_SIZE, labelsize=MEDIUM_SIZE)
plt.rc('xtick', labelsize=MEDIUM_SIZE)   # fontsize of the x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)   # fontsize of the y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)   # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title (suptitle)

# Load the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Read the training CSV into one array, then slice it column-wise:
# columns 1-10 become the inputs X and columns 11-13 the targets y
# (column 0 is left out -- presumably a row id, TODO confirm).
dataset = np.loadtxt('ML-CUP23-TR.csv', delimiter=',')
X, y = dataset[:, 1:11], dataset[:, 11:14]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Explore input data using box plots and histograms

In [None]:
import seaborn as sns
# One box per input feature: gather the 10 columns of the training inputs
data = [X_train[:, col] for col in range(10)]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(
    data,
    patch_artist=True,  # filled boxes, so that facecolor applies
    boxprops={'facecolor': 'lightcyan'},
    medianprops={'color': 'teal'},
    whiskerprops={'color': 'grey'},
    capprops={'color': 'darkslategray'},
    flierprops={'markerfacecolor': 'gainsboro', 'marker': 'o'},
)

# Labelling and a light dashed grid to ease reading the value ranges
ax.set_title('Box Plots of 10 input variables')
ax.set_xlabel('Input Variable #')
ax.set_ylabel('Value')
ax.grid(True, linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Figure number 2: a 2x5 grid holding one histogram per input column
plt.figure(2, figsize=(20, 8))
plt.suptitle('Histograms of the input data')
for col in range(10):
    # subplot positions are 1-based, column indices are 0-based
    panel = plt.subplot(2, 5, col + 1)
    panel.set_xlabel(f'x{col}')
    panel.hist(X_train[:, col], color='lightblue')