In [312]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Reading of the CSV file into a dataframe.

In [None]:
# Load the training data from 'train.csv' (path is relative to the working directory).
train_csv = pd.read_csv('train.csv')
# Bare last expression so the notebook renders the loaded frame.
train_csv

PREPROCESSING:
1) Removing duplicate rows.
2) For columns where the null value percentage is less than 5%, we remove the rows in which those null values occur.
3) For columns where the null value percentage is greater than or equal to 5% and less than 30%, we use imputation techniques to fill in the null values.
4) For columns where the null value percentage is greater than or equal to 30%, we drop the entire column because a large share of its values is missing.

In [314]:
# Remove fully duplicated rows; inplace=True mutates train_csv itself rather than returning a new frame.
train_csv.drop_duplicates(inplace=True)

In [None]:
# Display the frame after duplicate removal.
train_csv

In [None]:
# Number of missing values in each column.
train_csv.isnull().sum()

In [None]:
# Missing values per column, expressed as a percentage of the current row count.
null_value_percentages = (train_csv.isna().sum() / len(train_csv)) * 100
null_value_percentages

In [None]:
# Columns whose missing-value percentage is below 5%, ordered from highest to lowest.
# Despite the name, this holds per-column percentages; rows with nulls in these columns are removed later.
rows_to_drop = (
    null_value_percentages
    .loc[lambda pct: pct < 5]
    .sort_values(ascending=False)
)
rows_to_drop

In [None]:
# Keep only the column names (the Series index); the percentages are no longer needed here.
rows_to_drop = rows_to_drop.index
rows_to_drop

In [None]:
# Remove every row that has a null in any of the selected low-null columns.
# All names in rows_to_drop already satisfy the < 5% condition, so no per-column re-check is needed.
train_csv.dropna(subset=list(rows_to_drop), inplace=True)
train_csv

OBSERVATION: There are 9 columns for which the null value percentage is less than 5% and more than 0%.

In [None]:
# Columns whose missing-value percentage is at least 30% are dropped entirely.
# `>=` rather than `>`: the stated rule is "30% or more", so a column sitting exactly
# on the boundary must be selected as well.
columns_to_drop = null_value_percentages[null_value_percentages >= 30]

# Apply the rule instead of only listing the candidates. This is a no-op when the
# selection is empty; errors='ignore' keeps the cell safe to re-run after the columns
# have already been removed.
train_csv.drop(columns=columns_to_drop.index, inplace=True, errors='ignore')

columns_to_drop

OBSERVATION: There are no columns for which null value percentage is more than 30%.
As there were no columns dropped, there are no chances of having duplicates.

In [None]:
# Recompute the missing-value percentages on the current frame and keep only the
# columns that still contain nulls; their names are the imputation candidates.
null_value_percentages = (train_csv.isna().sum() / len(train_csv)) * 100
null_value_percentages = null_value_percentages.loc[lambda pct: pct > 0]
columns_to_impute = null_value_percentages.index
columns_to_impute

In [323]:
# For each imputation candidate stored with object dtype, show how often every
# value occurs (nulls included).
for column in columns_to_impute:
    if train_csv[column].dtype != object:
        continue
    print("Column: ",column,"\tCounts:\n", train_csv[column].value_counts(dropna=False))
    print("\n\n")

OBSERVATION: There are no categorical variables according to the code above. However, Feature2 is a categorical variable which has boolean values i.e. TRUE and FALSE. I will focus on Feature2 particularly.

In [None]:
# Count of missing entries in Feature2.
train_csv['Feature2'].isnull().sum()

OBSERVATION: Thus, there are 0 null values in the column 'Feature2'.
It has TRUE or FALSE values. Thus, to convert the categorical variable into a numerical variable, I will assign TRUE = 1 and FALSE = 0.
All the categorical variables must be converted to numerical variables before finding the best model.

In [None]:
# Encode Feature2 as an integer flag: 1 where the value compares equal to True, 0 otherwise.
# NOTE(review): assumes Feature2 was parsed as real booleans; string values such as
# "TRUE" would all map to 0 - confirm against the CSV.
train_csv['Feature2'] = train_csv['Feature2'].map(lambda flag: int(flag == True))
train_csv['Feature2']

# A one-hot encoder was considered here but is not used: a single 0/1 column is
# enough for a two-valued feature.

Now, all the categorical variables have been converted into numerical variables.
Now I have to handle the columns where the null value percentage is at least 5% and less than 30%.

In [None]:
# Missing-value percentages for the columns that still contain nulls.
null_value_percentages = (train_csv.isna().sum() / len(train_csv)) * 100
null_value_percentages = null_value_percentages.loc[lambda pct: pct > 0]
null_value_percentages

Mean imputation: Best for numerical data that is symmetrically distributed (i.e., normally distributed) without outliers. 
Median imputation: Best for numerical data that is skewed (non-symmetric) or contains outliers. 
Mode imputation: Best for categorical data or numerical data with discrete, frequently occurring values.

In [None]:
# Box plot of Feature4 to check for outliers before choosing an imputation strategy.
# The series is passed by keyword: a bare positional argument is bound to different
# parameters by different seaborn releases, which changes or breaks the plot.
fig, ax = plt.subplots()
sns.boxplot(x=train_csv['Feature4'], ax=ax)
ax.set_title('Distribution of Feature4')
plt.show()

As we can see there are no outliers in Feature4 and it is a numerical variable. Thus, I will impute the values using mean.

In [None]:
# Fill the missing Feature4 values with the column mean.
# NOTE(review): the mean is computed over all rows, before the train/test split performed
# later in the notebook, so held-out rows influence the imputed value. Consider computing
# it on the training split only.
mean_value = train_csv['Feature4'].mean()
train_csv['Feature4'] = train_csv['Feature4'].fillna(mean_value)
# Displays the remaining null count for Feature4 as a sanity check.
train_csv['Feature4'].isna().sum()

I will place the categorical variables in the leading columns and the numerical variables in the trailing columns.
This makes it easy to standardize only the numerical variables.

In [329]:
# Exchange the positions of Feature1 and Feature2 in the column order.
col1, col2 = 'Feature1', 'Feature2'

# Locate both columns in the current ordering.
columns = train_csv.columns.tolist()
col1_index = columns.index(col1)
col2_index = columns.index(col2)

# Write each name into the other's slot.
columns[col1_index] = col2
columns[col2_index] = col1

# Re-select the columns in the new order.
train_csv = train_csv[columns]

Now, I will split the given data into the features and the label.

In [None]:
# Every column except the last is treated as a feature; the last column is the label.
X, Y = train_csv.iloc[:, :-1], train_csv.iloc[:, -1]
print(X.head())
print(Y.head())

Splitting the data into 70% for training and 30% for testing, with a fixed random seed (random_state=1) so that the split is reproducible.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)
print(X_train.head())
print(X_test.head())

The X_train should be normalized (fitted and transformed). Then, using the same scaler we have to transform (only transform, not fit) the X_test.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column except the first one, which is left untouched
# (presumably the 0/1 Feature2 flag moved to the front earlier - verify the column order).
scaler = StandardScaler()
scaled_columns = list(X_train.columns[1:])

# Work on explicit copies: the frames returned by train_test_split are derived from X,
# and writing into them directly can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit on the training split only, then reuse the same statistics for the test split.
X_train_scaled = scaler.fit_transform(X_train[scaled_columns])
X_test_scaled = scaler.transform(X_test[scaled_columns])

# Assign by column name so the columns are replaced outright. Writing floats through
# .iloc into integer-typed columns is a dtype-incompatible in-place write.
X_train[scaled_columns] = X_train_scaled
X_test[scaled_columns] = X_test_scaled

print(X_train.head())
print(X_test.head())

Flatten the data for plotting.

In [333]:
# Sort the data points for plotting

#np.argsort() takes 1D arrays
#returns indices that sort X_train in ascending order
# sorted_indices = np.argsort(X_train.flatten())

#X_train is sorted as per the indices
#Y_train is also sorted as per the same indices, so that X-Y values align
# X_train_sorted = X_train[sorted_indices]
# Y_train_sorted = Y_train[sorted_indices]

Model Training

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split (fit returns the estimator itself).
regression = LinearRegression().fit(X_train, Y_train)

# Learned coefficients, intercept, and the estimator's configuration.
print(regression.coef_)
print(regression.intercept_)
print(regression.get_params())

In [None]:
# Predictions for the held-out split.
reg_pred = regression.predict(X_test)

# Predicted vs. actual targets; for a good fit the points should lie close to the y = x line.
plt.scatter(Y_test, reg_pred)
plt.xlabel('Y test data')
plt.ylabel('Predicted values by the linear regression ')
plt.show()

Finding the residuals.

In [None]:
# Residual = actual target minus predicted target, one value per held-out row.
residuals = Y_test.sub(reg_pred)
residuals

Now, I will find the mean squared error, mean absolute error and the square root of mean squared error.

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Compute each metric once, then report MSE, MAE and RMSE in that order.
mse = mean_squared_error(Y_test, reg_pred)
mae = mean_absolute_error(Y_test, reg_pred)

print(mse)
print(mae)
print(np.sqrt(mse))

R square and adjusted R square

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the linear model on the held-out split.
score = r2_score(Y_test, reg_pred)
print(score)

In [None]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n held-out rows and p feature columns.
n_obs = len(Y_test)
n_features = X_test.shape[1]
adjusted_r2 = 1 - (1 - score)*((n_obs - 1)/(n_obs - n_features - 1))
print(adjusted_r2)

In [None]:
# X_train and X_test were already standardized by the scaling cell earlier in this notebook.
# Fitting a second StandardScaler on them here would learn its statistics from data that is
# already standardized (an effectively identity transform) and would overwrite `scaler`, so
# it could no longer be used to transform raw data. The frames are therefore only
# re-displayed, not re-scaled.
print(X_train.head())
print(X_test.head())

READING OF TESTING CSV FILE AND PREPROCESSING OF THE DATA READ

In [None]:
# Load the test data from 'test.csv' (path is relative to the working directory).
test_csv = pd.read_csv('test.csv')
# Bare last expression so the notebook renders the loaded frame.
test_csv

In [342]:
# Remove fully duplicated rows; inplace=True mutates test_csv itself rather than returning a new frame.
test_csv.drop_duplicates(inplace=True)

In [None]:
# Number of missing values in each test column.
test_csv.isnull().sum()

In [None]:
# Exchange the positions of Feature1 and Feature2 in the test frame's column order.
col1, col2 = 'Feature1', 'Feature2'

# Locate both columns in the current ordering.
columns = test_csv.columns.tolist()
col1_index = columns.index(col1)
col2_index = columns.index(col2)

# Write each name into the other's slot.
columns[col1_index] = col2
columns[col2_index] = col1

# Re-select the columns in the new order and show the result.
test_csv = test_csv[columns]
test_csv

In [None]:
# Encode Feature2 as an integer flag: 1 where the value compares equal to True, 0 otherwise.
test_csv['Feature2'] = test_csv['Feature2'].map(lambda flag: int(flag == True))

# Remove the 'id' column from the test frame.
test_csv.drop('id', axis=1, inplace=True)
test_csv

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every test column except the first, mirroring how X_train / X_test were treated.
feature_cols = list(test_csv.columns[1:])

# The test set must be transformed with statistics learned from the training data, never
# fitted on itself; otherwise its features are on a different scale from the one the
# models were trained on. The scaler is fitted on the raw training rows: train_csv is
# expected to still hold unscaled values, because the earlier scaling was written to the
# X_train / X_test frames returned by train_test_split - confirm if that cell changes.
scaler = StandardScaler()
scaler.fit(train_csv.loc[X_train.index, feature_cols])

# Assign by column name so the columns are replaced with float data outright
# (no separate float cast is needed).
# NOTE: this mutates test_csv, so run the cell once per kernel session; re-running it
# would scale the already-scaled values again.
test_scaled = scaler.transform(test_csv[feature_cols])
test_csv[feature_cols] = test_scaled
test_csv

POLYNOMIAL REGRESSION

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

# Relies on X_train, X_test, Y_train and Y_test prepared by the earlier cells.

# Polynomial degrees to evaluate: 1 through 10.
orders = list(range(1, 11))

# Containers intended for per-degree errors and R2 scores.
train_errors, test_errors, r2_score_list = [], [], []

for order in orders:
    # Expand the features to the current degree; the expansion is fitted on the
    # training split and only applied to the test split.
    poly_features = PolynomialFeatures(degree=order)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    # Ridge regression (alpha=0.4) on the expanded training features.
    ridge_reg = Ridge(alpha=0.4).fit(X_train_poly, Y_train)

    # Report the score() value on both splits for this degree.
    print(f'Order = {order}, Training Ridge score = {ridge_reg.score(X=X_train_poly, y=Y_train)}, Testing Ridge score = {ridge_reg.score(X=X_test_poly, y=Y_test)}')