In [1]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer

In [2]:
# Read the dataset from df.csv, using the file's first column as the row index,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer='df.csv', index_col=0)
df.head(n=5)

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release,ROI
0,0,220,11.0,342,398.636364
1,0,220,11.0,282,211.363636
2,0,220,11.0,282,213.636364
3,0,220,11.5,282,388.636364
4,0,220,11.0,202,276.363636


In [3]:
# Keep the return on investment (ROI) column at two-decimal precision.
# Passing a dict to DataFrame.round limits the rounding to the listed column;
# all other columns are left as they are.
df = df.round({'ROI': 2})
df.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release,ROI
0,0,220,11.0,342,398.64
1,0,220,11.0,282,211.36
2,0,220,11.0,282,213.64
3,0,220,11.5,282,388.64
4,0,220,11.0,202,276.36


In [4]:
# Bucket the Days_after_release column into four labelled categories.
# pd.cut closes intervals on the right by default, so the buckets are
# (-inf, 14], (14, 60], (60, 365] and (365, inf].
bins = [-np.inf, 14, 60, 365, np.inf]
labels = ['2 weeks', '2 months', '1 year', 'Over 1 year']
days_binned = pd.cut(x=df['Days_after_release'], bins=bins, labels=labels)
df['Days_after_release'] = days_binned
df.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release,ROI
0,0,220,11.0,1 year,398.64
1,0,220,11.0,1 year,211.36
2,0,220,11.0,1 year,213.64
3,0,220,11.5,1 year,388.64
4,0,220,11.0,1 year,276.36


In [5]:
# One-hot encode the binned Days_after_release feature.
# The dummy dtype is pinned to uint8: pandas 2.0 changed the get_dummies
# default from uint8 to bool, which would turn the 0/1 indicator columns
# (and the CSVs written from them) into True/False on newer installs.
df = pd.get_dummies(df, columns=['Days_after_release'], dtype=np.uint8)
df.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,ROI,Days_after_release_2 weeks,Days_after_release_2 months,Days_after_release_1 year,Days_after_release_Over 1 year
0,0,220,11.0,398.64,0,0,1,0
1,0,220,11.0,211.36,0,0,1,0
2,0,220,11.0,213.64,0,0,1,0
3,0,220,11.5,388.64,0,0,1,0
4,0,220,11.0,276.36,0,0,1,0


In [6]:
# Hold out 20% of the rows as a test set. ROI is the target; every other
# column is used as a feature. random_state is fixed so the split is repeatable.
features = df.drop(columns='ROI')
target = df['ROI']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=350,
)

In [7]:
# (rows, columns) of the training and test feature frames
tuple(frame.shape for frame in (X_train, X_test))

((71863, 7), (17966, 7))

In [8]:
# Shapes of the training and test target series
tuple(series.shape for series in (y_train, y_test))

((71863,), (17966,))

In [9]:
# Show the dtype of each column in the training feature frame
X_train.dtypes

Brand                               int64
Retail Price                        int64
Shoe Size                         float64
Days_after_release_2 weeks          uint8
Days_after_release_2 months         uint8
Days_after_release_1 year           uint8
Days_after_release_Over 1 year      uint8
dtype: object

In [10]:
# Standardize the features with StandardScaler.
# The scaler is fit on the training split only, so no information from the
# test rows enters the scaling statistics.
scaler = StandardScaler().fit(X_train)
# Take the column labels from X_train itself rather than a hand-typed list, so
# the labels cannot drift out of sync with the columns that were actually scaled
# (e.g. if the bin labels or column order change upstream).
names = X_train.columns.tolist()
# transform() returns a plain array by default; the rebuilt frame therefore gets
# a fresh 0..n-1 index instead of X_train's original row labels.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=names)
X_train_scaled.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release_2 weeks,Days_after_release_2 months,Days_after_release_1 year,Days_after_release_Over 1 year
0,-0.58719,0.424191,-2.279876,-0.525917,1.793618,-0.618583,-0.607326
1,-0.58719,0.424191,0.720366,-0.525917,-0.557532,1.616597,-0.607326
2,-0.58719,0.424191,0.720366,1.901441,-0.557532,-0.618583,-0.607326
3,-0.58719,0.424191,0.29176,-0.525917,-0.557532,-0.618583,1.646562
4,-0.58719,0.424191,1.148972,-0.525917,-0.557532,-0.618583,1.646562


In [11]:
# Apply the already-fitted StandardScaler to the test features (transform only,
# no refit) and wrap the result in a labelled DataFrame.
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=names)
X_test_scaled.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release_2 weeks,Days_after_release_2 months,Days_after_release_1 year,Days_after_release_Over 1 year
0,-0.58719,0.424191,1.148972,-0.525917,-0.557532,-0.618583,1.646562
1,-0.58719,0.424191,2.006184,-0.525917,-0.557532,-0.618583,1.646562
2,-0.58719,0.424191,-1.208361,-0.525917,-0.557532,-0.618583,1.646562
3,-0.58719,0.424191,0.720366,1.901441,-0.557532,-0.618583,-0.607326
4,-0.58719,0.424191,0.077457,-0.525917,-0.557532,1.616597,-0.607326


In [12]:
# Rescale the features with MinMaxScaler (default settings), fit on the
# training split only, and wrap the result in a labelled DataFrame.
MMScaler = MinMaxScaler().fit(X_train)
X_train_mmscaled = pd.DataFrame(MMScaler.transform(X_train), columns=names)
X_train_mmscaled.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release_2 weeks,Days_after_release_2 months,Days_after_release_1 year,Days_after_release_Over 1 year
0,0.0,0.75,0.037037,0.0,1.0,0.0,0.0
1,0.0,0.75,0.555556,0.0,0.0,1.0,0.0
2,0.0,0.75,0.555556,1.0,0.0,0.0,0.0
3,0.0,0.75,0.481481,0.0,0.0,0.0,1.0
4,0.0,0.75,0.62963,0.0,0.0,0.0,1.0


In [13]:
# Apply the already-fitted MinMaxScaler to the test features (transform only,
# no refit) and wrap the result in a labelled DataFrame.
X_test_mmscaled = pd.DataFrame(MMScaler.transform(X_test), columns=names)
X_test_mmscaled.head()

Unnamed: 0,Brand,Retail Price,Shoe Size,Days_after_release_2 weeks,Days_after_release_2 months,Days_after_release_1 year,Days_after_release_Over 1 year
0,0.0,0.75,0.62963,0.0,0.0,0.0,1.0
1,0.0,0.75,0.777778,0.0,0.0,0.0,1.0
2,0.0,0.75,0.222222,0.0,0.0,0.0,1.0
3,0.0,0.75,0.555556,1.0,0.0,0.0,0.0
4,0.0,0.75,0.444444,0.0,0.0,1.0,0.0


In [14]:
# Fit a PowerTransformer (default settings) on the training target and
# transform it. The Series is reshaped to a single-column 2-D array first,
# which is the input layout the transformer is given here.
y_train_2d = y_train.to_numpy().reshape(-1, 1)
pow_trans = PowerTransformer().fit(y_train_2d)
y_train_trans = pd.DataFrame(pow_trans.transform(y_train_2d), columns=['ROI'])
y_train_trans.head()

Unnamed: 0,ROI
0,-1.214983
1,0.032613
2,-0.91682
3,0.639804
4,-1.340153


In [15]:
# Transform the test target with the PowerTransformer fitted on the training
# target (transform only, no refit).
y_test_2d = y_test.to_numpy().reshape(-1, 1)
y_test_trans = pd.DataFrame(pow_trans.transform(y_test_2d), columns=['ROI'])
y_test_trans.head()

Unnamed: 0,ROI
0,-0.704178
1,-0.533926
2,-0.762239
3,-0.676355
4,-1.471094


In [16]:
# Write every raw, scaled and transformed split to disk as CSV.
# Each entry pairs the output file name with the object to save; files are
# written in the order listed.
artifacts = [
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
    ('X_train_scaled', X_train_scaled),
    ('X_test_scaled', X_test_scaled),
    ('X_train_mmscaled', X_train_mmscaled),
    ('X_test_mmscaled', X_test_mmscaled),
    ('y_train_trans', y_train_trans),
    ('y_test_trans', y_test_trans),
]
for saved_name, saved_obj in artifacts:
    saved_obj.to_csv(saved_name)