### Load libraries

In [None]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import LeaveOneOut, cross_val_score, train_test_split, KFold
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

### Load Model 1 Dataset Input

In [None]:
## CHANGE PATH TO DIRECTORY WHERE THE DATASET FOR MODEL 1 IS LOCATED
# The directory can also be supplied through the MODEL1_DATA_DIR environment
# variable, so the notebook can run on another machine without editing this cell.
# When the variable is not set, the original hard-coded location is used.
MODEL1_DATA_DIR = os.environ.get(
    "MODEL1_DATA_DIR",
    "/anonymized_path/sakshikumar/Documents/UCD/04. SP 2025/464 Practicum/KWSM finale/FINAL FILES",
)

# The working directory is switched (rather than only prefixing the read path)
# because the export cell at the end of the notebook writes its CSV relative to
# the current directory.
os.chdir(MODEL1_DATA_DIR)
data = pd.read_csv("dataset.csv")
data.head()

In [None]:
# Print the frame's summary (column dtypes and non-null counts) to check for missing values.
data.info()

#### Correlation Matrix of all Variables

In [None]:
# Pairwise correlations across every float64/int64 column of the raw dataset.
numeric_data = data.select_dtypes(include=['float64', 'int64'])
corr = numeric_data.corr()

# Annotated heatmap of the correlation matrix on a 16x12 inch figure.
plt.figure(figsize=(16, 12))
sns.heatmap(corr, annot=True, cmap='coolwarm')

#### Preparing Variables for Modelling

In [None]:
# Columns to leave out of the modelling frame.
excluded_cols = [
    'URL', 'Keyword', 'Source', 'Has_SERP_Feature', 'Position',
    'Active users', 'New users', 'Returning users', 'Total users', 'Event count',
    'Clicks', 'Transactional', 'Views per session', 'intent', 'Engaged sessions per active user',
    'Sessions',
]
features = [col for col in data.columns if col not in excluded_cols]

# Take an explicit copy: later cells overwrite and add columns on data2, and
# assigning into a column-subset of `data` raises pandas' SettingWithCopyWarning.
# The copy also guarantees those writes can never reach back into `data`.
data2 = data[features].copy()

# Correlation heatmap restricted to the remaining float64/int64 columns.
corr2 = data2.select_dtypes(include=['float64', 'int64']).corr()
plt.figure(figsize=(16,12))
sns.heatmap(corr2, cmap='coolwarm', annot=True)

#### Imputing Missing Values

In [None]:
# Fill gaps in the float64/int64 columns of data2 with each column's median.
# NOTE(review): the imputer is fitted on all rows before the train/test split
# further down, so test rows contribute to the medians — confirm this is acceptable.
imputer = SimpleImputer(strategy='median')
numeric_cols = data2.select_dtypes(include=['float64', 'int64']).columns
imputed_values = imputer.fit_transform(data2[numeric_cols])
data2[numeric_cols] = imputed_values

#### Calculating inverse of Position for Ranking Ease
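This is the dependent variable (the model target): `Position_Inverse = 100 / (Position + 1)`, so a better (lower) position yields a higher value.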

In [None]:
# Target: 100 / (Position + 1), so a smaller (better) Position gives a larger value.
# Position is read from the original `data` frame because data2 does not carry that column.
position_offset = data['Position'] + 1
data2['Position_Inverse'] = 100 / position_offset
data2.head()

#### Setting up Variables for Modelling

In [None]:
# Predictors: every data2 column except the target itself and two columns held out of the model.
non_predictors = ('CPC', 'Position_Inverse', 'Commercial_Trans_Ratio')
feats_final = [name for name in data2.columns if name not in non_predictors]

y = data2['Position_Inverse']  # larger value = better position
X = data2[feats_final]

X.columns

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=57,
    test_size=0.2,
)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

#### Scaling Model Variables

In [None]:
# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Model Pipeline - Fitting Model 1 to Data

In [None]:
# Fit the Lasso regression on the scaled training rows and report its final duality gap.
lasso = Lasso(alpha=0.1, max_iter=50000).fit(X_train_scaled, y_train)
print(lasso.dual_gap_)

# Predict the held-out rows and score them.
y_pred = lasso.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
for label, value in (("MSE", mse), ("RMSE", rmse), ("R²", r2)):
    print(f"{label}: {value:.4f}")

#### Extracting and Ordering Variables by Importance (Coefficient size)

In [None]:
# One row per predictor with its fitted Lasso coefficient, ordered by
# coefficient magnitude (largest first).
feature_importance = (
    pd.DataFrame({'Feature': feats_final, 'Coefficient': lasso.coef_})
    .assign(Absolute_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Absolute_Coefficient', ascending=False)
)
feature_importance

#### Structuring Final Output
Calculating the Opportunity Score for the final keywords output: `(Commercial + Transactional) * Volume / (KD + 1) / (Predicted_Position + 1)`.

In [None]:
# Predict the inverse-position target for every keyword (all rows, not only the test split).
X_all_scaled = scaler.transform(X)
predicted_inverse = lasso.predict(X_all_scaled)

# A linear model's output is unbounded: a prediction <= 0 would turn into an
# infinite or negative Predicted_Position below and flip the sign of the score.
# Keep predictions inside the target range implied by the observed positions.
inverse_floor = 100 / (data['Position'].max() + 1)    # target value of the worst observed position
inverse_ceiling = 100 / (data['Position'].min() + 1)  # target value of the best observed position
predicted_inverse = np.clip(predicted_inverse, inverse_floor, inverse_ceiling)

# Undo the 100 / (Position + 1) transform to get back to a position.
data['Predicted_Position'] = (100 / predicted_inverse) - 1

# Intent-weighted volume, discounted by keyword difficulty and by predicted position.
data['Opportunity_Score'] = (
    (data['Commercial'] + data['Transactional']) * data['Volume']
    / (data['KD'] + 1)
    / (data['Predicted_Position'] + 1)
)

# Number of keywords announced in the heading and shown in the preview below.
TOP_N = 20

# Sort by opportunity score to find highest potential keywords
best_keywords = data.sort_values('Opportunity_Score', ascending=False)
print(f"\nTop {TOP_N} Keywords to Target Based on Lasso Model:")

# best_keywords is already ordered by score, so selecting columns is enough.
result1 = best_keywords[['Keyword', 'Opportunity_Score', 'Position', 'Predicted_Position',
                         'Volume', 'KD', 'Commercial', 'Transactional']]
result1.head(TOP_N)

#### Export to CSV

In [None]:
## CHANGE PATH TO YOUR DIRECTORY TO SAVE THE MODEL 1 OUTPUT
# The file name is relative, so it is written to the current working directory;
# the row index is left out of the file.
model1_output_csv = 'Model1_Table.csv'
result1.to_csv(model1_output_csv, index=False)