In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error


In [2]:
df = pd.read_csv('/kaggle/input/fairs-extramarital-affairs-data/Affairs.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,4,0,male,37.0,10.0,no,3,18,7,4
1,5,0,female,27.0,4.0,no,4,14,6,4
2,11,0,female,32.0,15.0,yes,1,12,1,4
3,16,0,male,57.0,15.0,yes,5,18,6,5
4,23,0,male,22.0,0.75,no,2,17,6,3


In [3]:
df.loc[df.gender== 'male', 'gender'] = 0
df.loc[df.gender== 'female', 'gender'] = 1
df['gender'] = df['gender'].astype(int)
df.loc[df.children== 'no', 'children'] = 0
df.loc[df.children== 'yes', 'children'] = 1
df['children'] = df['children'].astype(int)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,affairs,gender,age,yearsmarried,children,religiousness,education,occupation,rating
0,4,0,0,37.0,10.0,0,3,18,7,4
1,5,0,1,27.0,4.0,0,4,14,6,4
2,11,0,1,32.0,15.0,1,1,12,1,4
3,16,0,0,57.0,15.0,1,5,18,6,5
4,23,0,0,22.0,0.75,0,2,17,6,3


In [5]:
X = df[['religiousness', 'age', 'gender', 'yearsmarried', 'education', 'occupation', 'affairs']]
y = df['rating']

## base model

In [6]:
regression = LinearRegression()
regression.fit(X,y)
first_model = (mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model)

1.049873864469667


Benchmark is MSE = 1.05. Coeffients of first model is below. Loop go through model and zip fucntion to combine two columns. 

In [7]:
coef_dict_baseline = {}
for coef, feat in zip (regression.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'religiousness': 0.042352811106391774,
 'age': -0.009059645428673824,
 'gender': 0.08882013337087079,
 'yearsmarried': -0.030458802565476582,
 'education': 0.06810255742293711,
 'occupation': -0.005979506852998218,
 'affairs': -0.07882571247653963}

## Elastic net

In [8]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import numpy as np

# Initialize an instance of ElasticNet
elastic = ElasticNet()

# Create a scaler for feature normalization
scaler = StandardScaler()

# Define the parameter grid for grid search
param_grid = {
    'alpha': np.logspace(-5, 2, 8),
    'l1_ratio': [.2, .4, .6, .8]
}

# Create the GridSearchCV object
search = GridSearchCV(
    estimator=elastic,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    refit=True,
    cv=10
)

# Fit the search object to your data
search.fit(scaler.fit_transform(X), y)

# Access the best parameters and best score
print(search.best_params_)
print(abs(search.best_score_))


{'alpha': 0.1, 'l1_ratio': 0.2}
1.082099130262313


In [9]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Initialize an instance of ElasticNet
elastic = ElasticNet(alpha=0.1, l1_ratio=0.2)

# Create a scaler for feature normalization
scaler = StandardScaler()

# Normalize the input features
X_normalized = scaler.fit_transform(X)

# Fit the ElasticNet model to the data
elastic.fit(X_normalized, y)

# Calculate the mean squared error
second_model = mean_squared_error(y_true=y, y_pred=elastic.predict(X_normalized))
print(second_model)


1.054803668959378


In [10]:
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'religiousness': 0.022384075219597198,
 'age': -0.07862087999645949,
 'gender': 0.010257030159819398,
 'yearsmarried': -0.1412598148984351,
 'education': 0.112202230296599,
 'occupation': -0.0,
 'affairs': -0.23202450198262234}

occupation removed