Splitting data
---

In [None]:
import pandas as pd

# Read the measurements from the CSV file into a DataFrame
data_df = pd.read_csv(filepath_or_buffer='electric-cars.csv')

# Preview the top of the table (the first 5 rows)
data_df.head(n=5)

In [None]:
import numpy as np

# One integer position per row of the DataFrame: 0, 1, ..., n-1
n = data_df.shape[0]
indexes = np.arange(0, n)

print('indexes:', indexes)

In [None]:
# Cut position for a 70/30 split; int() truncates the fractional part
split_idx = int(0.7 * n)

# Everything before the cut goes to the train set, the rest to the test set
train_idx, test_idx = np.split(indexes, [split_idx])

print('train indexes:', train_idx)
print('test indexes:', test_idx)

In [None]:
# Pull the first two columns out of the DataFrame as NumPy arrays:
# column 0 -> x (temperatures), column 1 -> y (consumption)
x, y = (data_df.iloc[:, col].values for col in (0, 1))

# Select the train/test entries of each array with the index arrays
x_tr, x_te = x[train_idx], x[test_idx]
y_tr, y_te = y[train_idx], y[test_idx]

print('train:', x_tr.shape, y_tr.shape)
print('test:', x_te.shape, y_te.shape)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Plot the train and test points on the same axes to see which part of the
# temperature range each set covers
plt.scatter(x_tr, y_tr, label='train set')
plt.scatter(x_te, y_te, label='test set')
plt.xlabel('temperature')
# y is the consumption column of the dataset, so label the axis accordingly
plt.ylabel('consumption')
plt.legend()
plt.show()

In [None]:
# Shuffle the indexes in place. No seed is set in the cells shown before this
# one, so the resulting order is expected to change from one run to the next.
np.random.shuffle(indexes)

print('indexes:', indexes) # output varies between runs; depends on the RNG seed

In [None]:
# Fix the seed of NumPy's global RNG so the shuffled order is reproducible
np.random.seed(0)

# Draw a random ordering of the positions 0..n-1 in a single call
indexes = np.random.permutation(n)

print('indexes:', indexes) # [35 34 41 27 11 ..]

In [None]:
# Split the (now shuffled) indexes 70/30 into train/test indexes
split_idx = int(n*0.7)
train_idx = indexes[:split_idx]
test_idx = indexes[split_idx:]

# Apply the index split to the data arrays
x_tr, y_tr = x[train_idx], y[train_idx]
x_te, y_te = x[test_idx], y[test_idx]

# Plot both sets to check how they are spread over the temperature range
plt.scatter(x_tr, y_tr, label='train set')
plt.scatter(x_te, y_te, label='test set')
plt.xlabel('temperature')
# y is the consumption column of the dataset, so label the axis accordingly
plt.ylabel('consumption')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with the sizes given as fractions of the data;
# random_state fixes the shuffling so the split is reproducible
x_tr, x_te, y_tr, y_te = train_test_split(
    x,
    y,
    test_size=0.3,
    train_size=0.7,
    random_state=0,
)

In [None]:
from sklearn.model_selection import train_test_split

# Split data with the sizes given as absolute sample counts (integers)
# instead of fractions. The counts are derived from the data (70% for train,
# the remainder for test) rather than hard-coded, so they always add up to
# the number of samples; for 57 samples this gives 39 and 18.
n_train = int(len(x) * 0.7)
n_test = len(x) - n_train
x_tr, x_te, y_tr, y_te = train_test_split(
    x, y, train_size=n_train, test_size=n_test, random_state=0)

In [None]:
from sklearn.metrics import mean_squared_error as mse

# Fit two polynomial models on the training points only (degree 5, then 10)
coef5, coef10 = (np.polyfit(x_tr, y_tr, deg=degree) for degree in (5, 10))

# Degree 5: predict on both sets and compare train vs test error
y_pred5_tr, y_pred5_te = np.polyval(coef5, x_tr), np.polyval(coef5, x_te)

print('Degree 5 MSE: {:.0f} (train) vs {:.0f} (test)'.format(
    mse(y_tr, y_pred5_tr), mse(y_te, y_pred5_te)))
# Degree 5 MSE: 719 (train) vs 651 (test)

# Degree 10: same evaluation for the more flexible model
y_pred10_tr, y_pred10_te = np.polyval(coef10, x_tr), np.polyval(coef10, x_te)

print('Degree 10 MSE: {:.0f} (train) vs {:.0f} (test)'.format(
    mse(y_tr, y_pred10_tr), mse(y_te, y_pred10_te)))
# Degree 10 MSE: 707 (train) vs 1218 (test)