In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from metric_names import metrics
from model_names import models

In [2]:
# Load the full dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv('data.csv')
# Echo (n_rows, n_columns) as the cell output.
df.shape

(604349, 27)

In [3]:
# Total number of missing values in the frame: per-column counts, then summed across columns.
df.isnull().sum().sum()

0

In [4]:
# Keep the per-column dtypes in a variable and echo them as the cell output.
data_types = df.dtypes
data_types

BrukerID         int64
Kjonn           object
Alder          float64
Jobb           float64
Postkode        object
FilmID           int64
Rangering        int64
Tidstempel     float64
Tittel          object
Action           int64
Adventure        int64
Animation        int64
Children         int64
Comedy           int64
Crime            int64
Documentary      int64
Drama            int64
Fantasy          int64
Film-Noir        int64
Horror           int64
Musical          int64
Mystery          int64
Romance          int64
Sci-Fi           int64
Thriller         int64
War              int64
Western          int64
dtype: object

In [5]:
# dtype families used to route columns to the scaler (numeric) or the encoder (string).
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
strings = ['object']

# Numeric feature columns; the target 'Rangering' is numeric too, so it is taken out.
# list.remove raises ValueError if the target is not among the numeric columns.
num_cols = df.select_dtypes(include=numerics).columns.tolist()
num_cols.remove('Rangering')

# String-typed feature columns.
cat_cols = df.select_dtypes(include=strings).columns.tolist()

In [6]:
# Hold out 33% of the rows for evaluation; the seed keeps the split repeatable.
df_train, df_test = train_test_split(df, random_state=0, test_size=0.33)

# Separate the target column 'Rangering' from the features in each partition.
# drop() returns a new frame, so df_train / df_test keep the target column.
X_train, y_train = df_train.drop(columns='Rangering'), df_train['Rangering']
X_test, y_test = df_test.drop(columns='Rangering'), df_test['Rangering']

In [7]:
# build a base model 
#import xgboost as xgb
#data_dmatrix = xgb.DMatrix(data=X,label=y)

In [8]:
#xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                #max_depth = 5, alpha = 10, n_estimators = 10)
# The XGBoost experiment above is disabled; this cell only echoes the numeric
# feature columns (target 'Rangering' already removed) that feed the scaler.
num_cols

['BrukerID',
 'Alder',
 'Jobb',
 'FilmID',
 'Tidstempel',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [9]:
# Estimator class looked up in the project's model registry.
# NOTE(review): assumed to be scikit-learn's RandomForestRegressor (so that
# `random_state` is accepted) — confirm against model_names.
regressor = models['regression']['ensemble']['RandomForestRegressor']

# Column-wise preprocessing:
#   * string columns  -> one-hot encoding; categories unseen during fit are
#     ignored at predict time instead of raising
#   * numeric columns -> standardisation with StandardScaler
# The step name 'normilizer' is kept as-is because it is a runtime key
# (named_transformers_ / get_params) that other code may reference.
preprocessor = ColumnTransformer(
    [
        ('encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('normilizer', StandardScaler(), num_cols),
    ]
)

# Full model: preprocessing followed by a 20-tree forest trained on all cores.
# random_state is fixed (same seed as the train/test split) so that the fitted
# model and the reported metrics are reproducible on Restart & Run All;
# without it every run bootstraps and splits differently.
pipeline = Pipeline(
    [
        ('data_preprocessor', preprocessor),
        ('regressor', regressor(n_estimators=20, random_state=0, verbose=3, n_jobs=-1)),
    ]
)


#mse = metrics['regression']['mse']
#rmse = np.sqrt(mse(df_test, preds))



In [10]:
# Fit the whole pipeline on the training partition, then predict the held-out rows.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_preds = pipeline.fit(X_train, y_train).predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20building tree 5 of 20building tree 6 of 20
building tree 7 of 20

building tree 8 of 20
building tree 9 of 20

building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20


[Parallel(n_jobs=-1)]: Done   4 out of  20 | elapsed:  6.0min remaining: 24.1min


building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  6.7min remaining:  5.5min
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed: 10.9min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.0min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   4 out of  20 | elapsed:    0.2s remaining:    1.1s
[Parallel(n_jobs=12)]: Done  11 out of  20 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=12)]: Done  18 out of  20 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=12)]: Done  20 out of  20 | elapsed:    0.5s finished


In [11]:
#df_test['Rangering']

# Metric callable from the project's registry.
# NOTE(review): presumably mean squared error with a (y_true, y_pred) signature — confirm in metric_names.
mse = metrics['regression']['mse']

# Square root of the held-out MSE, echoed as the cell output.
test_mse = mse(y_test, y_preds)
rmse = np.sqrt(test_mse)
rmse

1.29221574115593

In [12]:
# Metric callable from the project's registry.
# NOTE(review): presumably the coefficient of determination with a (y_true, y_pred) signature — confirm in metric_names.
r2 = metrics['regression']['r2']
# Score the held-out predictions and echo the value as the cell output.
r2_score = r2(y_test, y_preds)
r2_score

0.5732355887716511

In [13]:
#from model_pipeline import ModelingPipeline
#mp = ModelingPipeline()
from sklearn.feature_selection import SelectKBest, f_regression
import matplotlib.pyplot as plt
def select_features(X, y, columns):
    """Plot the univariate ``f_regression`` score of every feature in ``X``.

    A ``SelectKBest`` selector with ``k='all'`` is fitted purely to obtain one
    score per feature; no feature is dropped. The scores are drawn as a bar
    chart on a 10x10 inch figure.

    Parameters
    ----------
    X : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix handed to ``SelectKBest.fit``. ``f_regression`` needs
        numeric input, so string columns must be encoded or removed first.
    y : array-like of shape (n_samples,)
        Regression target.
    columns : sequence
        Bar labels. Accepted forms:

        * exactly one label per feature: used as given;
        * one label more than there are features (the historical calling
          convention, a column list that still contains the target): when
          ``X`` carries its own ``columns`` those are used, so labels stay
          aligned wherever the target sits in the list; otherwise the last
          entry is dropped, as before.

    Raises
    ------
    ValueError
        If no label set matching the number of features can be derived.
    """
    model = SelectKBest(score_func=f_regression, k='all')
    model.fit(X, y)

    scores = model.scores_
    n_features = len(scores)
    columns = list(columns)

    if len(columns) == n_features:
        labels = columns
    elif hasattr(X, 'columns') and len(X.columns) == n_features:
        # Dropping the last entry of `columns` would mislabel every bar after
        # the target's position unless the target happens to be last.
        labels = list(X.columns)
    elif len(columns) == n_features + 1:
        # Legacy convention: the trailing entry is taken to be the target.
        labels = columns[:-1]
    else:
        raise ValueError(
            'Expected %d feature labels (or %d including the target), got %d'
            % (n_features, n_features + 1, len(columns))
        )

    # Size the figure at creation time. Assigning rcParams after plotting only
    # affects figures created later and leaks a global setting.
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.bar(labels, scores)
    ax.set_xlabel('Features')
    ax.set_ylabel('Score')
    ax.set_title('Feature importance score')
    # Many feature names would overlap horizontally.
    ax.tick_params(axis='x', rotation=90)
    plt.show()

In [14]:
#columns =list(df_train.columns)
#select_features(X_train, y_train, columns)