######################################################################

'''    Project No. 1: Mercedes-Benz Greener Manufacturing       '''

######################################################################


#### Step 1 : Import the Required Libraries 

In [1]:
import pandas as pd 
import numpy as np
from sklearn.decomposition import PCA   # for Dimensionality Reduction 

#### Step 2 : Read the data from train.csv

In [2]:
df_train = pd.read_csv("train.csv")

#### Understanding the shape of data 

In [3]:
print('Size of Training Set : {} rows and {} columns' .format(*df_train.shape))

Size of Training Set : 4209 rows and 378 columns


#### Viewing Top 5 Rows in df_train dataset 

In [4]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


#### Step 3 : Collect the 'y'  values into array 

#### Here we are seperating the 'y' from the data as we will use this 'y' as to learn this prediction output

In [5]:
y_train = df_train['y'].values 

In [6]:
y_train

array([130.81,  88.53,  76.26, ..., 109.22,  87.48, 110.85])

#### Step 4 : Understand the datatypes we have 

#### iterate through all columns which has 'X' in the name of the column 

In [7]:
cols = [c for c in df_train.columns if 'X' in c ]

print('Number of Features: {}'. format(len(cols)))

print('Feature types:')

df_train[cols].dtypes.value_counts()

Number of Features: 376
Feature types:


int64     368
object      8
dtype: int64

#### Step 5 : Count the data in each of the columns 

In [8]:
counts = [[] ,[] ,[]]
for c in cols :
    type1 = df_train[c].dtype
    unique1 = len(np.unique(df_train[c]))
    if unique1==1:
        counts[0].append(c)
    elif unique1 == 2 and type1 == np.int64:
        counts[1].append(c)
    else:
        counts[2].append(c)

print('Constant Feature: {} , Binary Features: {} , Categorical Features: {}\n' .format(*[len(c) for c in counts]))

print('Constant Features: ',counts[0])
print('Categorical Features: ',counts[2])

Constant Feature: 12 , Binary Features: 356 , Categorical Features: 8

Constant Features:  ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical Features:  ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


#### Step 6 :  Read the data from test.csv 

In [9]:
df_test = pd.read_csv('test.csv')

#### Remove Columns 'ID'and 'y' from the data as they will not be used for learning 

In [10]:
usable_columns = list(set(df_train.columns) - set(['ID' , 'y']))
y_train = df_train['y'].values
id_test = df_test['ID'].values

x_train = df_train[usable_columns]
x_test = df_test[usable_columns]

#### Step 7 : Check for unique and null values for test and train set 

In [11]:
def check_missing_values(df):
    if df.isnull().any().any():
        print('There are missing values in the Dataframe')
    else:
        print('There are no missing values in a Dataframe')
        

In [12]:
check_missing_values(x_train)
check_missing_values(x_test)

There are no missing values in a Dataframe
There are no missing values in a Dataframe


#### Step 8 : If for any column(s) , the variance is equal to 0 

#### We need to remove those Variable(s)

#### And Then apply LABEL ENCODER

In [13]:
for column in usable_columns:
    cardinality = len(np.unique(x_train[column]))
    if cardinality == 1:
        x_train.drop(column , axis=1)   # column with only 1 value is useless , so we drop it 
        x_test.drop(column , axis =1)   
    
    if cardinality > 2 :    # Column is Categorical 
        mapper = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(mapper)
        x_test[column] = x_test[column].apply(mapper)
        
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].apply(mapper)


Unnamed: 0,X243,X202,X352,X126,X304,X265,X253,X289,X101,X282,...,X208,X163,X280,X54,X284,X127,X266,X283,X61,X327
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0,0,0,0,1,1,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,0,0,1,0,0,0,1,0,...,0,1,0,1,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


#### Step 9 : Make sure the data is now changed into numericals 

In [14]:
print('Feature Types: ')

x_train[cols].dtypes.value_counts()

Feature Types: 


int64    376
dtype: int64

#### Step 10 : Perform Dimensionality Reduction 

#### Linear Dimensionality Reduction using Single Value Decomposition of the data to project it to a lower dimensional space. 

In [15]:
n_comp = 12 
pca = PCA(n_components = n_comp , random_state=420)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)


#### Step 11 : Training using XGBOOST

In [16]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [17]:
x_train,x_valid,y_train,y_valid = train_test_split(pca2_results_train, y_train , test_size = 0.2 , random_state=4242)

In [18]:
d_train = xgb.DMatrix(x_train, label = y_train)
d_valid = xgb.DMatrix(x_valid , label = y_valid)
d_test = xgb.DMatrix(pca2_results_test)

In [19]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

In [20]:
params  # its a Dictionary 

{'objective': 'reg:linear', 'eta': 0.02, 'max_depth': 4}

In [21]:
def xgb_r2_score(preds ,dtrain):
        labels = dtrain.get_label()
        return 'r2' ,r2_score(labels,preds)

In [22]:
watchlist = [(d_train, 'train') , (d_valid,'valid')]

In [23]:
watchlist

[(<xgboost.core.DMatrix at 0x1b161337460>, 'train'),
 (<xgboost.core.DMatrix at 0x1b161337400>, 'valid')]

In [24]:
clf = xgb.train(params ,d_train ,1000 ,watchlist ,early_stopping_rounds=50 ,feval =xgb_r2_score ,maximize=True ,verbose_eval=10)

[0]	train-rmse:99.14835	train-r2:-58.35295	valid-rmse:98.26297	valid-r2:-67.63754
[10]	train-rmse:81.27653	train-r2:-38.88428	valid-rmse:80.36433	valid-r2:-44.91014
[20]	train-rmse:66.71610	train-r2:-25.87403	valid-rmse:65.77334	valid-r2:-29.75260
[30]	train-rmse:54.86915	train-r2:-17.17724	valid-rmse:53.89120	valid-r2:-19.64513
[40]	train-rmse:45.24563	train-r2:-11.36018	valid-rmse:44.22231	valid-r2:-12.90160
[50]	train-rmse:37.44742	train-r2:-7.46672	valid-rmse:36.37773	valid-r2:-8.40705
[60]	train-rmse:31.15105	train-r2:-4.85891	valid-rmse:30.01780	valid-r2:-5.40531
[70]	train-rmse:26.08768	train-r2:-3.10906	valid-rmse:24.90852	valid-r2:-3.41040
[80]	train-rmse:22.04898	train-r2:-1.93527	valid-rmse:20.82563	valid-r2:-2.08304
[90]	train-rmse:18.84731	train-r2:-1.14472	valid-rmse:17.59529	valid-r2:-1.20077
[100]	train-rmse:16.33664	train-r2:-0.61138	valid-rmse:15.07634	valid-r2:-0.61575
[110]	train-rmse:14.39753	train-r2:-0.25155	valid-rmse:13.14553	valid-r2:-0.22839
[120]	train-rmse:

#### Step 12: Predict your test_df values using xgboost

In [25]:
p_test = clf.predict(d_test)

In [26]:
p_test

array([ 83.01027,  97.86023,  83.30593, ...,  98.83134, 107.41438,
        96.70509], dtype=float32)

In [27]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p_test
sub.to_csv('xgb.csv' , index=False)



In [28]:
sub.head()

Unnamed: 0,ID,y
0,1,83.010269
1,2,97.860229
2,3,83.305931
3,4,77.200882
4,5,112.303635


######################################################################

    '''                          End                        '''

######################################################################