In [1]:
import pandas as pd

# Toy dataset: hours studied, previous exam score, and the pass/fail outcome.
data = dict(
    StudyHours=list(range(1, 11)),
    PrevExamScore=[30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    Pass=[0] * 5 + [1] * 5,  # 0 = Fail, 1 = Pass
)

df = pd.DataFrame(data)

# Separate the frame into the feature matrix (X) and the target vector (y).
X = df.loc[:, ['StudyHours', 'PrevExamScore']]
y = df.loc[:, 'Pass']

In [2]:
import statsmodels.api as sm

# Backward elimination with OLS.
#
# The design matrix lives in its own variable (`X_ols`) so the shared feature
# frame `X` is never re-bound here: re-running this cell stays idempotent and
# later cells do not silently inherit the intercept column or a reduced
# feature set.
P_VALUE_THRESHOLD = 0.05

# Add a constant (intercept) column to a copy of the features
X_ols = sm.add_constant(X)

# Fit the Ordinary Least Squares (OLS) regression model on all features
model = sm.OLS(y, X_ols).fit()

# Display the full-model summary (including p-values)
print(model.summary())

# Repeatedly drop the feature with the highest p-value while that p-value
# exceeds the threshold. The intercept ('const') is never a removal candidate.
while True:
    feature_pvalues = model.pvalues.drop('const', errors='ignore').dropna()
    if feature_pvalues.empty:
        # Nothing left to eliminate (intercept-only model).
        break
    worst_feature = feature_pvalues.idxmax()
    if feature_pvalues[worst_feature] <= P_VALUE_THRESHOLD:
        # Every remaining feature is significant at the chosen level.
        break
    X_ols = X_ols.drop(columns=worst_feature)
    model = sm.OLS(y, X_ols).fit()

# Final model after backward elimination
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   Pass   R-squared:                       0.758
Model:                            OLS   Adj. R-squared:                  0.688
Method:                 Least Squares   F-statistic:                     10.94
Date:                Thu, 15 May 2025   Prob (F-statistic):            0.00701
Time:                        16:26:52   Log-Likelihood:               -0.17258
No. Observations:                  10   AIC:                             6.345
Df Residuals:                       7   BIC:                             7.253
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -0.3333      1.464     -0.228

  return hypotest_fun_in(*args, **kwds)
  return hypotest_fun_in(*args, **kwds)


In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

def forward_selection(X, y, test_size=0.2, random_state=42):
    """Greedy forward feature selection scored by held-out R-squared.

    Starting from an empty feature set, each round fits a ``LinearRegression``
    on every "already selected + one candidate" column subset and keeps the
    candidate with the highest R-squared on the test split. Selection stops
    as soon as no candidate strictly improves on the best score so far.

    Args:
        X: DataFrame whose columns are the candidate features.
        y: Target values aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Forwarded to ``train_test_split`` (default 42).

    Returns:
        list: Selected column names in the order they were added. Empty when
        no single feature achieves a test R-squared above 0.
    """
    # Ordered, de-duplicated candidates: iteration order is deterministic,
    # unlike iterating over a set.
    remaining_features = list(dict.fromkeys(X.columns))
    selected_features = []
    # Baseline 0.0 is the R-squared of always predicting the mean of the test
    # targets; a feature must beat that to be selected.
    current_score = 0.0

    # The row split depends only on the number of rows and the seed, so it is
    # computed once and every candidate subset is scored on the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    while remaining_features:
        scores_with_candidates = []

        for feature in remaining_features:
            features_to_test = selected_features + [feature]

            # Train on the candidate subset and score it on the held-out rows
            model = LinearRegression()
            model.fit(X_train[features_to_test], y_train)
            y_pred = model.predict(X_test[features_to_test])
            score = r2_score(y_test, y_pred)

            scores_with_candidates.append((score, feature))

        # Highest score wins; ties fall back to comparing the feature names,
        # matching a descending sort of (score, feature) tuples.
        best_score, best_feature = max(scores_with_candidates)

        # Written as a negated '<' so a NaN score also stops the search.
        if not (current_score < best_score):
            break

        remaining_features.remove(best_feature)
        selected_features.append(best_feature)
        current_score = best_score

    return selected_features

# Run forward selection on every raw feature column. The features are taken
# straight from `df` so the result does not depend on whether another cell
# has re-bound or modified the notebook-level `X`.
best_features = forward_selection(df.drop(columns='Pass'), y)
print(f"Selected features using Forward Selection: {best_features}")

Selected features using Forward Selection: ['PrevExamScore']


In [4]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Build the feature matrix straight from `df` so LASSO always sees every raw
# feature, independent of whether another cell has re-bound or modified `X`.
# No intercept column is added: Lasso fits its own intercept by default, so a
# constant column would only contribute a redundant zero coefficient.
lasso_features = df.drop(columns='Pass')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    lasso_features, y, test_size=0.2, random_state=42
)

# Initialize the LASSO model with a regularization parameter (alpha)
lasso_model = Lasso(alpha=0.1)

# Train the LASSO model
lasso_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = lasso_model.predict(X_test)

# Evaluate the model's performance using R-squared
r2 = r2_score(y_test, y_pred)
print(f'R-squared score: {r2}')

# Display the LASSO coefficients (same order as the columns of lasso_features)
print(f'LASSO Coefficients: {lasso_model.coef_}')

R-squared score: 0.9997884297520662
LASSO Coefficients: [0.         0.02463636]
