# Pre-processing & Training Data 

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats

### Load Data

In [25]:
df = pd.read_csv('data/anime_df.csv', index_col=0)
df.head()

Unnamed: 0,anime_id,name,type,episodes,rating,members,Action,Adventure,Cars,Comedy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,32281,Kimi no Na wa.,Movie,1.0,9.37,200630,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,64.0,9.26,793665,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,51.0,9.25,114262,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,24.0,9.17,673572,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,9969,Gintama&#039;,TV,51.0,9.16,151266,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Get Dummy

In [30]:
df_dummy = pd.get_dummies(df, columns=['type'], prefix="type_")
df_dummy.head()

Unnamed: 0,anime_id,name,episodes,rating,members,Action,Adventure,Cars,Comedy,Dementia,...,Thriller,Vampire,Yaoi,Yuri,type__Movie,type__Music,type__ONA,type__OVA,type__Special,type__TV
0,32281,Kimi no Na wa.,1.0,9.37,200630,0,0,0,0,0,...,0,0,0,0,True,False,False,False,False,False
1,5114,Fullmetal Alchemist: Brotherhood,64.0,9.26,793665,1,1,0,0,0,...,0,0,0,0,False,False,False,False,False,True
2,28977,Gintama°,51.0,9.25,114262,1,0,0,1,0,...,0,0,0,0,False,False,False,False,False,True
3,9253,Steins;Gate,24.0,9.17,673572,0,0,0,0,0,...,1,0,0,0,False,False,False,False,False,True
4,9969,Gintama&#039;,51.0,9.16,151266,1,0,0,1,0,...,0,0,0,0,False,False,False,False,False,True


### Normalize

In [32]:
from sklearn.preprocessing import MinMaxScaler


In [47]:
scaler_columns = ['episodes', 'members']
scaler = MinMaxScaler()

# Fit and transform the data
scaled_episodes, scaled_members = scaler.fit_transform(df[scaler_columns]).T
df['scaled_episodes'] = scaled_episodes
df['scaled_members'] = scaled_members
df = df.drop(columns=['anime_id', 'name', 'type', 'episodes', 'members'])
df.head()

Unnamed: 0,rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri,scaled_episodes,scaled_members
0,9.37,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0.0,0.197867
1,9.26,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0.034673,0.782769
2,9.25,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.027518,0.112683
3,9.17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0.012658,0.664323
4,9.16,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.027518,0.14918


### Split data into train & test

In [54]:
from sklearn.model_selection import train_test_split

In [64]:
X = df.drop('rating', axis=1).values
y = df[['rating']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [66]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7952, 45), (3917, 45), (7952, 1), (3917, 1))