In [17]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures


In [18]:
# Degree passed to the polynomial feature expansion used by the model.
MODEL_DEGREE = 2

In [19]:
# Load the raw dataset from a CSV file addressed relative to the working directory.
data = pd.read_csv('assignment2dataset.csv')
# Print column names, dtypes and non-null counts of the loaded frame.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [20]:
# Separate the predictor columns from the target column.
X = data.drop(columns='Performance Index')  # Features
Y = data['Performance Index']  # Label

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    shuffle=True,
    random_state=10,
)


In [21]:
# Fit the encoder on the training split only; `le` is kept so the same
# encoder object can be applied to other splits later.
le = LabelEncoder()
extracurricular_codes = le.fit_transform(X_train['Extracurricular Activities'])

# Replace the text column with its integer codes and show the resulting dtypes.
X_train['Extracurricular Activities'] = extracurricular_codes
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 7496 to 1289
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype
---  ------                            --------------  -----
 0   Hours Studied                     8000 non-null   int64
 1   Previous Scores                   8000 non-null   int64
 2   Extracurricular Activities        8000 non-null   int64
 3   Sleep Hours                       8000 non-null   int64
 4   Sample Question Papers Practiced  8000 non-null   int64
dtypes: int64(5)
memory usage: 375.0 KB


In [22]:
# Place the training features and the training label side by side in one frame.
train_data = pd.concat([X_train, y_train], axis='columns')

In [23]:
# Minimum absolute correlation with the label for a feature to be kept.
CORRELATION_THRESHOLD = 0.25

# Pairwise correlation between every column of the training frame
# (features and label), using DataFrame.corr's default method.
corr = train_data.corr()

# Columns whose absolute correlation with the label exceeds the threshold.
top_feature = corr.index[corr['Performance Index'].abs() > CORRELATION_THRESHOLD]

# Remove the label itself by name rather than by position, so a real feature
# can never be dropped by accident if the column order changes.
top_feature = top_feature.drop('Performance Index')

# Keep only the selected feature columns for training.
X_train = X_train[top_feature]

# NOTE(review): this prints the original, unfiltered X (all five columns, with
# the text column still un-encoded), not the X_train selected just above.
# X_train.info() was probably intended — confirm before changing.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Hours Studied                     10000 non-null  int64 
 1   Previous Scores                   10000 non-null  int64 
 2   Extracurricular Activities        10000 non-null  object
 3   Sleep Hours                       10000 non-null  int64 
 4   Sample Question Papers Practiced  10000 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 390.8+ KB


In [24]:
def PolynomialFeatures(X, degree):
    """Return X extended with product columns of its features up to `degree`.

    Each pass multiplies every column currently in the frame by every one of
    the original columns and appends the product under the name
    ``"<left>*<right>"``. A product is skipped when a column with that name,
    or with the two parts swapped, already exists.

    Note: this definition shadows the ``PolynomialFeatures`` class imported
    from ``sklearn.preprocessing`` at the top of the notebook; the name is kept
    because later cells call this function.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Column names must be strings, because new names are
        built by string concatenation. The frame passed in is not modified.
    degree : int
        Highest degree to generate. ``degree - 1`` expansion passes are run,
        so for ``degree <= 1`` the input frame is returned as-is.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the generated product columns, on the
        same index as ``X``.
    """
    originalFeatures = list(X.columns)

    for _ in range(degree - 1):
        # Snapshot of the columns at the start of this pass; columns produced
        # during the pass are only multiplied further in the next pass.
        columns = list(X.columns)
        # Tracks existing and just-created names so that the duplicate check
        # also sees products generated earlier in the same pass.
        knownNames = set(columns)
        newColumns = []

        for feature1 in columns:
            for feature2 in originalFeatures:
                newFeature = feature1 + "*" + feature2
                newFeatureReversed = feature2 + "*" + feature1
                # Only the exact name and its single swapped form are checked,
                # so with three or more input columns and degree >= 3 the same
                # product can still appear under permuted names
                # (e.g. "a*b*c" and "a*c*b").
                if newFeature in knownNames or newFeatureReversed in knownNames:
                    continue
                # Name the product through the public Series API instead of
                # writing into the Index's backing array.
                newColumns.append((X[feature1] * X[feature2]).rename(newFeature))
                knownNames.add(newFeature)

        # One concat per pass instead of one per generated column.
        if newColumns:
            X = pd.concat([X, *newColumns], axis=1)
    return X
# Expand the selected training features into product terms of degree MODEL_DEGREE.
X_train_poly = PolynomialFeatures(X_train, degree=MODEL_DEGREE)


In [25]:
# Preview the first rows of the expanded training features.
X_train_poly.head()

Unnamed: 0,Hours Studied,Previous Scores,Hours Studied*Hours Studied,Hours Studied*Previous Scores,Previous Scores*Previous Scores
7496,3,58,9,174,3364
7694,4,82,16,328,6724
9248,4,96,16,384,9216
2470,3,57,9,171,3249
4392,2,44,4,88,1936


In [26]:
# Fit a linear regression on the expanded training features
# (fit() returns the estimator itself, so the call can be chained).
poly_model = linear_model.LinearRegression().fit(X_train_poly, y_train)

# Predict on the same rows the model was trained on and measure the
# mean squared error of those predictions.
y_train_predicted = poly_model.predict(X_train_poly)
train_mse = mean_squared_error(y_train, y_train_predicted)

print("Model Training Error:" + str(train_mse))


Model Training Error:5.188894452140708


In [27]:
# Test data preprocessing.
# Encode with the mapping already learned from the training split:
# transform() reuses the fitted classes, whereas fit_transform() would relearn
# them from the test rows and could assign different codes to the same label.
# transform() raises ValueError on a label that was not seen during fitting.
X_test['Extracurricular Activities'] = le.transform(X_test['Extracurricular Activities'])

# Keep the same feature columns that were selected on the training split.
X_test = X_test[top_feature]

# Apply the same polynomial expansion that was used for training.
X_test_poly = PolynomialFeatures(X_test, degree=MODEL_DEGREE)

# Predict on the held-out rows with the model fitted on the training split.
y_test_predicted = poly_model.predict(X_test_poly)

print("Model Test Error:" + str(mean_squared_error(y_test, y_test_predicted)))


Model Test Error:5.308525471376886
