In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import warnings
warnings.simplefilter("ignore")

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [8]:
data=pd.read_csv("data.csv")
data.head()

Unnamed: 0,Area,Sensing Range,Transmission Range,Number of Sensor nodes,Number of Barriers
0,5000,15,30,100,30
1,5000,16,32,112,35
2,5000,17,34,124,42
3,5000,18,36,136,48
4,5000,19,38,148,56


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Area                    182 non-null    int64
 1   Sensing Range           182 non-null    int64
 2   Transmission Range      182 non-null    int64
 3   Number of Sensor nodes  182 non-null    int64
 4   Number of Barriers      182 non-null    int64
dtypes: int64(5)
memory usage: 7.2 KB


In [10]:
def plot_boxplot(data,ft):
    data.boxplot(column=[ft])
    plt.grid(False)
    plt.show()

In [11]:
def outliers(data,ft):
    Q1=data[ft].quantile(0.25)
    Q3=data[ft].quantile(0.75)
    IQR=Q3-Q1
    lower_bound=Q1-1.5*IQR
    upper_bound=Q3+1.5*IQR

    ls=data.index[ (data[ft] < lower_bound) | (data[ft]> upper_bound) ]

    return ls

In [13]:
index_list=[]
for feature in ["Area","Sensing Range","Transmission Range","Number of Sensor nodes","Number of Barriers"]:
    index_list.extend(outliers(data,feature))
index_list

[22, 23, 24, 25]

In [14]:
def remove(data,ls):
    ls=sorted(set(ls))
    data=data.drop(ls)
    return data

In [15]:
data_cleaned=remove(data,index_list)

In [16]:
data_cleaned

Unnamed: 0,Area,Sensing Range,Transmission Range,Number of Sensor nodes,Number of Barriers
0,5000,15,30,100,30
1,5000,16,32,112,35
2,5000,17,34,124,42
3,5000,18,36,136,48
4,5000,19,38,148,56
...,...,...,...,...,...
177,50000,36,72,352,101
178,50000,37,74,364,107
179,50000,38,76,376,114
180,50000,39,78,388,121


In [17]:
data_cleaned.isnull().sum()

Area                      0
Sensing Range             0
Transmission Range        0
Number of Sensor nodes    0
Number of Barriers        0
dtype: int64

In [7]:
#df.drop(columns=["subject#"],inplace=True)

In [18]:
X=df.drop("Number of Barriers",axis=1)
y=df["Number of Barriers"]

In [19]:

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=9)

from sklearn.linear_model import LinearRegression
model1=LinearRegression()
model1.fit(X_train,y_train)

from sklearn.model_selection import cross_val_score
print("Intercept:",model1.intercept_)
print("coefficients:",model1.coef_)

train_predictions=model1.predict(X_train)
test_predictions=model1.predict(X_test)

print("Train R2:",model1.score(X_train,y_train))
print("Test R2:",model1.score(X_test,y_test))
print("cross validation score:",cross_val_score(model1,X,y,cv=5).mean())

Intercept: -11.546999356790266
coefficients: [-0.00177162  0.0474344   0.09486881  0.56921284]
Train R2: 0.8753323532551173
Test R2: 0.9031898662833902
cross validation score: 0.7596905128322411


In [20]:
import statsmodels.formula.api as smf
model2=smf.ols("y~X",data=df).fit()
model2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.882
Model:,OLS,Adj. R-squared:,0.881
Method:,Least Squares,F-statistic:,670.0
Date:,"Tue, 23 Apr 2024",Prob (F-statistic):,7.62e-84
Time:,07:43:18,Log-Likelihood:,-823.37
No. Observations:,182,AIC:,1653.0
Df Residuals:,179,BIC:,1662.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0679,0.026,-2.593,0.010,-0.120,-0.016
X[0],-0.0018,0.000,-16.503,0.000,-0.002,-0.002
X[1],-0.3885,0.167,-2.321,0.021,-0.719,-0.058
X[2],-0.7771,0.335,-2.321,0.021,-1.438,-0.116
X[3],0.7673,0.086,8.954,0.000,0.598,0.936

0,1,2,3
Omnibus:,76.74,Durbin-Watson:,0.262
Prob(Omnibus):,0.0,Jarque-Bera (JB):,207.887
Skew:,1.837,Prob(JB):,7.21e-46
Kurtosis:,6.731,Cond. No.,4.29e+19


In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=13)

In [22]:
from sklearn.preprocessing import PolynomialFeatures
polynomial_converter=PolynomialFeatures(degree=2)
X_train_poly=pd.DataFrame(polynomial_converter.fit_transform(X_train))

from sklearn.linear_model import LinearRegression
model=LinearRegression()
model.fit(X_train_poly,y_train)

ypred_train=model.predict(X_train_poly)

from sklearn.metrics import mean_squared_error,r2_score
print("Train RMSE:",np.sqrt(mean_squared_error(y_train,ypred_train)))
print("Train R2:",r2_score(y_train,ypred_train))

from sklearn.model_selection import cross_val_score
print("Cross Validation Score:",cross_val_score(model,X_train_poly,y_train,cv=5).mean())


Train RMSE: 8.66932062973149
Train R2: 0.9811266890624486
Cross Validation Score: 0.9770759708343613


In [23]:
X_test_poly=pd.DataFrame(polynomial_converter.transform(X_test))

ypred_test=model.predict(X_test_poly)

print("Test RMSE:",np.sqrt(mean_squared_error(y_test,ypred_test)))
print("Test R2:",r2_score(y_test,ypred_test))

Test RMSE: 9.165516886535158
Test R2: 0.983299235484626


In [32]:
train_r2=[]
test_r2=[]

for i in range(1,10):
    polynomial_converter=PolynomialFeatures(degree=i)
    X_train_poly=pd.DataFrame(polynomial_converter.fit_transform(X_train))

    model=LinearRegression()
    model.fit(X_train_poly,y_train)

    train_pred=model.predict(X_train_poly)
    train_r2.append(model.score(X_train_poly,y_train))

    X_test_poly=pd.DataFrame(polynomial_converter.transform(X_test))

    test_pred=model.predict(X_test_poly)
    test_r2.append(model.score(X_test_poly,y_test))

In [33]:
train_r2

[0.8753323532551173,
 0.9805032763932127,
 0.9968659464187524,
 0.9995523740311517,
 0.999699435656276,
 0.9409278855194873,
 0.9029377925512312,
 0.7605413376422427,
 0.5639356455011562]

In [34]:
test_r2

[0.90318986628339,
 0.9861800910738064,
 0.9971298822538031,
 0.9994167660797556,
 0.9995332175886285,
 0.9337906336932731,
 0.8133378384907464,
 0.651065538801431,
 0.4089117836082178]

In [35]:
final_poly_converter=PolynomialFeatures(degree=5)
X_train_poly=pd.DataFrame(final_poly_converter.fit_transform(X_train))

final_model=LinearRegression()
final_model.fit(X_train_poly,y_train)

train_pred=final_model.predict(X_train_poly)
print("train R2:",final_model.score(X_train_poly,y_train))
print("Cross validation score:",cross_val_score(model,X_train_poly,y_train,cv=5).mean())

X_test_poly=pd.DataFrame(final_poly_converter.transform(X_test))
test_pred=final_model.predict(X_test_poly)
print("test R2:",final_model.score(X_test_poly,y_test))

train R2: 0.999699435656276
Cross validation score: 0.9995491179485677
test R2: 0.9995332175886285


In [24]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=9)

In [25]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
estimator=Lasso()
param_grid={"alpha":list(range(1,100))}

model_hp=GridSearchCV(estimator,param_grid,cv=5,scoring='r2')

model_hp.fit(X_train,y_train)
model_hp.best_params_

{'alpha': 99}

In [26]:
from sklearn.linear_model import Lasso
lasso_best=Lasso(alpha=99)
lasso_best.fit(X_train,y_train)

print("Intercept:",lasso_best.intercept_)
print("coefficients:",lasso_best.coef_)

ypred_train=lasso_best.predict(X_train)
from sklearn.metrics import r2_score
print("train r2:",r2_score(y_train,ypred_train))
from sklearn.model_selection import cross_val_score
print("Cv score:",cross_val_score(lasso_best,X_train,y_train,cv=5).mean())

ypred_test=lasso_best.predict(X_test)
print("test r2:",r2_score(y_test,ypred_test))


Intercept: -6.7874303495966615
coefficients: [-0.00177397  0.          0.          0.57673261]
train r2: 0.875040885598322
Cv score: 0.8614317392516752
test r2: 0.8993477946686774


In [27]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
estimator=Ridge()

param_grid={"alpha":list(range(1,100))}

model_hp=GridSearchCV(estimator,param_grid,cv=5,scoring='r2')

model_hp.fit(X_train,y_train)
model_hp.best_params_

{'alpha': 99}

In [28]:
ridge_best=Ridge(alpha=99)
ridge_best.fit(X_train,y_train)

print("intercept:",ridge_best.intercept_)
print("coefficients:",ridge_best.coef_)

ypred_train=ridge_best.predict(X_train)
print("train r2:",r2_score(y_train,ypred_train))
print("cv Score:",cross_val_score(ridge_best,X_train,y_train,cv=5).mean())

ypred_test=ridge_best.predict(X_test)
print("test r2:",r2_score(y_test,ypred_test))

intercept: -13.969820976654404
coefficients: [-0.0018914   0.04948742  0.09897484  0.59384905]
train r2: 0.8829194944262496
cv Score: 0.8709011753871903
test r2: 0.8577364086697425


In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=9)

from sklearn.linear_model import ElasticNet
enr_base=ElasticNet()
enr_base.fit(X_train,y_train)

train_predictions=enr_base.predict(X_train)
test_predictions=enr_base.predict(X_test)

print("train R2:",enr_base.score(X_train,y_train))
print("test R2:",enr_base.score(X_test,y_test))
from sklearn.model_selection import cross_val_score
print("cross Validation Score:",cross_val_score(enr_base,X,y,cv=5).mean())

train R2: 0.8753323344833486
test R2: 0.9031612985187849
cross Validation Score: 0.7597450234303746


In [30]:
from sklearn.model_selection import GridSearchCV

estimator=ElasticNet()

param_grid={"alpha":[0.1,0.2,1,2,3,5,10],"l1_ratio":[0.1,0.5,0.75,0.9,0.95,1]}

model_hp=GridSearchCV(estimator,param_grid,cv=5,scoring='neg_mean_squared_error')
model_hp.fit(X_train,y_train)
model_hp.best_params_

{'alpha': 10, 'l1_ratio': 0.5}

In [31]:
enr_best=ElasticNet(alpha=10,l1_ratio=0.5)
enr_best.fit(X_train,y_train)

print("intercept:",enr_best.intercept_)
print("coefficients",enr_best.coef_)

train_predictions=enr_best.predict(X_train)
test_predictions=enr_best.predict(X_test)

print("Train R2:",enr_best.score(X_train,y_train))
print("test R2:",enr_best.score(X_test,y_test))
print("cross validation score:",cross_val_score(enr_best,X,y,cv=5).mean())

intercept: -9.710595960996727
coefficients [-0.00177182  0.          0.          0.58799505]
Train R2: 0.8753304781663882
test R2: 0.9029027187274774
cross validation score: 0.7602319864712601
