### Install Packages

In [None]:
pip install google-analytics-data

### Load libraries

In [None]:
import os
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
## Set working directory - CHANGE THIS TO YOUR PATH
# Relative file names used by later cells (credentials.json, dataset.csv,
# pagePath.csv) are resolved against this directory.
working_directory = "/anonymized_path/sakshikumar/Documents/UCD/04. SP 2025/464 Practicum/KWSM finale/FINAL FILES"
os.chdir(working_directory)

### Fetch GA4 API data

In [None]:
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import DateRange, Dimension, Metric, RunReportRequest
from google.oauth2 import service_account

# Authenticate with the service-account key file found in the working directory.
creds = service_account.Credentials.from_service_account_file("credentials.json")
client = BetaAnalyticsDataClient(credentials=creds)

property_id = "359323282"

# A single run_report call returns only one page of rows (the API applies a
# default row limit when `limit` is unset), so request explicit pages and keep
# advancing `offset` until every row reported by `row_count` has been read.
page_size = 100000
rows = []
offset = 0
while True:
    request = RunReportRequest(
        property=f"properties/{property_id}",
        dimensions=[
            Dimension(name="pagePath"),
            Dimension(name="eventName")
        ],
        metrics=[Metric(name="eventCount")],
        date_ranges=[DateRange(start_date="2024-05-10", end_date="today")],
        limit=page_size,
        offset=offset,
    )
    response = client.run_report(request)

    # Flatten each API row into [pagePath, eventName, eventCount].
    for row in response.rows:
        rows.append([dim.value for dim in row.dimension_values] + [metric.value for metric in row.metric_values])

    offset += len(response.rows)
    # Stop on an empty page (defensive) or once all rows have been collected.
    if not response.rows or offset >= response.row_count:
        break

df = pd.DataFrame(rows, columns=["pagePath", "eventName", "eventCount"])
# Metric values arrive as strings; convert so the counts can be summed/sorted.
df["eventCount"] = pd.to_numeric(df["eventCount"])

In [None]:
# Preview the first 10 rows of the GA4 report (pagePath, eventName, eventCount)
df.head(10)

In [None]:
# Export the distinct page paths seen in the GA4 report as a one-column CSV
unique_page_paths = df['pagePath'].unique()
list_df = pd.DataFrame(unique_page_paths)
list_df.to_csv('pagePath.csv', index=False)

### Loading Model 2 Dataset Input

In [None]:
# 1. Load the Model 2 input table and remove exact duplicate rows
attempt2_df = pd.read_csv('dataset.csv').drop_duplicates()

# 2. Define the binary target: Low_Bounce is 1 when 'Bounce rate' is below 0.3, else 0
#    (an earlier alternative target, average engagement time per session > 30, is not used)
low_bounce_mask = attempt2_df['Bounce rate'] < 0.3
attempt2_df['Low_Bounce'] = low_bounce_mask.astype(int)

# Show the class balance of the target
print("Low Bounce distribution:\n", attempt2_df['Low_Bounce'].value_counts())


In [None]:
# Frequency of each distinct 'Bounce rate' value in the Model 2 dataset
attempt2_df['Bounce rate'].value_counts()

### Preparing variables for Modelling

In [None]:
# 3. Prepare Features
# Identifier/text columns are not model inputs; 'pagePath' is only dropped when present.
drop_cols = ['URL', 'Keyword', 'pagePath'] if 'pagePath' in attempt2_df.columns else ['URL', 'Keyword']

# 'Low_Bounce' is computed directly from 'Bounce rate', so 'Bounce rate' must be
# excluded from the features as well -- otherwise the model is handed the answer
# (target leakage) and its scores/probabilities are meaningless.
X = attempt2_df.drop(columns=drop_cols + ['Low_Bounce', 'Bounce rate'])
y = attempt2_df['Low_Bounce']

#### Drawing Correlation Matrix to check for Multicollinearity

In [None]:
# Heatmap of pairwise feature correlations, colour scale centred on zero
fig, ax = plt.subplots(figsize=(8, 6))
feature_corr = X.corr()
sns.heatmap(feature_corr, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# Correlation cut-off above which a feature is treated as redundant
threshold = 0.85

# 1. Absolute pairwise correlations between features
corr_matrix = X.corr().abs()

# 2. Keep only the strict upper triangle so each pair is considered once
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# 3. A column is flagged when it correlates above the threshold with any
#    column that precedes it in the matrix
exceeds_threshold = (upper > threshold).any(axis=0)
to_drop = upper.columns[exceeds_threshold].tolist()

print(f"✅ Features to drop due to high correlation (> {threshold}):")
print(to_drop)

# 4. Remove the flagged columns
X_reduced = X.drop(columns=to_drop)

print(f"\nShape before dropping: {X.shape}")
print(f"Shape after dropping: {X_reduced.shape}")

### Model Pipeline

In [None]:
# Pipeline: standardise -> oversample the minority class -> logistic regression.
# Scaling comes first because SMOTE synthesises points between nearest
# neighbours (distance-based) and because the fitted coefficients are later
# ranked by magnitude, which is only meaningful when features share a scale.
# Keeping every step inside the imblearn Pipeline means scaling and resampling
# are fitted on the training portion only during cross-validation.
model_pipeline_reduced = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42, k_neighbors=1)),
    ('logreg', LogisticRegression(class_weight='balanced', max_iter=50000, random_state=42))
])

#### Cross-Validation Modelling

In [None]:
# 3. Cross-validation setup: four classification metrics, 5 stratified shuffled folds
scoring = ['accuracy', 'precision', 'recall', 'f1']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=90)

# 4. Evaluate the pipeline on the reduced feature set (test-fold scores only)
cv_results_reduced = cross_validate(
    model_pipeline_reduced,
    X_reduced,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=False,
)

# 5. Report the per-fold scores and their mean for every metric
print("\n✅ Cross-Validation Results (Reduced Feature Set):\n")

for metric in scoring:
    fold_scores = cv_results_reduced[f'test_{metric}']
    metric_label = metric.capitalize()
    print(f"{metric_label} (Test Set): {fold_scores}")
    print(f"Mean {metric_label}: {np.mean(fold_scores):.4f}")
    print("-" * 40)


In [None]:
# 6. One-row overview table: mean test score per metric
mean_scores = {
    metric_name: np.mean(cv_results_reduced[f'test_{metric_name}'])
    for metric_name in scoring
}
cv_summary_reduced = pd.DataFrame([mean_scores])

print("\n✅ Cross-Validation Metric Summary (Reduced Set, Mean over 5 folds):\n")
print(cv_summary_reduced)

### Fitting the Model Pipeline

In [None]:
# Fit the pipeline on the full reduced feature set
model_pipeline_reduced.fit(X_reduced, y)

# Pull the coefficient vector out of the fitted logistic-regression step
fitted_logreg = model_pipeline_reduced.named_steps['logreg']
coefficients_reduced = fitted_logreg.coef_[0]
features_reduced = X_reduced.columns

# Pair each feature with its coefficient, largest absolute value first
feature_importances_reduced = (
    pd.DataFrame({'Feature': features_reduced, 'Coefficient': coefficients_reduced})
    .sort_values(by='Coefficient', key=lambda coef: coef.abs(), ascending=False)
)

### Plotting Feature Importance

In [None]:
# Horizontal bar chart of the signed coefficients, in the order of the table
fig, ax = plt.subplots(figsize=(9, 5))
ax.barh(
    feature_importances_reduced['Feature'],
    feature_importances_reduced['Coefficient'],
)
ax.set_xlabel('Coefficient')
ax.set_title('Feature Importances (Reduced Feature Set)')
ax.axvline(0, color='black', linewidth=0.8)  # zero reference line
ax.invert_yaxis()  # first table row at the top
ax.grid(axis='x')
fig.tight_layout()
plt.show()

### Predict Probability of Engagement

In [None]:
# 2. Probability of class 1 (Low Bounce) for every row of the reduced feature set
predicted_probs = model_pipeline_reduced.predict_proba(X_reduced)[:, 1]

# 3. Copy the full dataset rows matching X_reduced's index and attach the probabilities
attempt2_df_reduced = attempt2_df.loc[X_reduced.index].copy()
attempt2_df_reduced['Predicted_Engagement_Prob'] = predicted_probs

# 4. Show the ten keywords with the highest predicted probability
print("\nPredicted Probabilities of Low Bounce (High Engagement):\n")
ranked_predictions = attempt2_df_reduced[['Keyword', 'Predicted_Engagement_Prob']].sort_values(
    'Predicted_Engagement_Prob', ascending=False
)
ranked_predictions.head(10)

### Export to a CSV for Analysis

In [None]:
### CHANGE THIS TO A DIRECTORY WHERE YOU WANT TO SAVE THE FILE
# Write the dataset with the predicted probabilities to CSV (row index omitted)
collated_dataset_path = '/anonymized_path/sakshikumar/Documents/UCD/04. SP 2025/464 Practicum/KWSM finale/collated_dataset.csv'
attempt2_df_reduced.to_csv(collated_dataset_path, index=False)

-----------

### Merge Model 1 Results for Final Output

#### Load Model 1 Results

In [None]:
### CHANGE THIS TO A DIRECTORY WHERE YOU HAVE SAVED MODEL 1 RESULTS
## Read the Model 1 results table and preview its first rows
model1_results_path = "/anonymized_path/sakshikumar/Documents/UCD/04. SP 2025/464 Practicum/KWSM finale/Model1_Table.csv"
model1_res = pd.read_csv(model1_results_path)
model1_res.head()

#### Display Model 2 Input Dataset

In [None]:
# Preview the first rows of the Model 2 input dataset (includes the derived 'Low_Bounce' column)
attempt2_df.head()

#### Merge the 2 Datasets

In [None]:
# Inner join on 'Keyword': keep only keywords present in both the Model 1
# results and the Model 2 predictions
merged_keywords = model1_res.merge(attempt2_df_reduced, how='inner', on='Keyword')
print(f"Merged dataset shape: {merged_keywords.shape}")
merged_keywords.head()

#### Clean Merged Dataset

In [None]:
# Clean merged dataset
# Columns present in both inputs come out of the merge suffixed '_x' (left,
# Model 1) and '_y' (right, Model 2). Keep the Model 1 copies under their plain
# names and discard the Model 2 duplicates.
shared_cols = ['Commercial', 'Transactional', 'Position', 'KD', 'Volume']

# errors='ignore' plus a non-inplace rename keeps this cell safe to re-run:
# once the '_y' columns are gone and the '_x' columns renamed, both steps are
# no-ops instead of raising KeyError. A dedicated list name is used so the
# `drop_cols` list from the feature-preparation cell is not overwritten.
merged_keywords = (
    merged_keywords
    .drop(columns=[f'{col}_y' for col in shared_cols], errors='ignore')
    .rename(columns={f'{col}_x': col for col in shared_cols})
)

merged_keywords.head()

#### Applying Filters to Obtain Final Output

In [None]:
# 3. Filter for Commercial or Transactional intent
high_intent_keywords = merged_keywords[
    (merged_keywords['Commercial'] == 1) | (merged_keywords['Transactional'] == 1)
]

# 4. Filter for low KD (e.g., KD <= 20) -- currently disabled
# low_kd_keywords = high_intent_keywords[high_intent_keywords['KD'] <= 20]

# 5. Filter for High Predicted Engagement (probability of at least 0.7)
high_engagement_keywords = high_intent_keywords[high_intent_keywords['Predicted_Engagement_Prob'] >= 0.7]

# Select relevant columns.
# .copy() makes final_keywords an independent DataFrame, so adding columns to it
# later does not raise SettingWithCopyWarning (it is otherwise a slice of a slice).
final_keywords = high_engagement_keywords[['Keyword', 'Commercial', 'Transactional', 'KD', 'Volume', 'Opportunity_Score', 'Position', 'Predicted_Position', 'Predicted_Engagement_Prob']].copy()
final_keywords.sort_values(by='Predicted_Position', ascending=True)

#### Calculating New Opportunity Score and Ordering Final Output

In [None]:
# New opportunity score:
#   (Commercial + Transactional) * Volume * Predicted_Engagement_Prob
#   ------------------------------------------------------------------
#                (KD + 1) * (Predicted_Position + 1)
intent_weight = final_keywords['Commercial'] + final_keywords['Transactional']
score_numerator = intent_weight * final_keywords['Volume'] * final_keywords['Predicted_Engagement_Prob']
score_denominator = (final_keywords['KD'] + 1) * (final_keywords['Predicted_Position'] + 1)
final_keywords['New_Opportunity_Score'] = score_numerator / score_denominator

# Rank keywords from highest to lowest new score
final_keywords_sorted = final_keywords.sort_values(by='New_Opportunity_Score', ascending=False)

# Display the final recommended keywords
print("\n✅ Final Ranked Keywords based on New Opportunity Score:\n")
display_columns = ['Keyword', 'Commercial', 'Transactional', 'KD', 'Volume', 'New_Opportunity_Score', 'Position', 'Predicted_Position', 'Predicted_Engagement_Prob']
final_keywords_sorted[display_columns]

In [None]:
## CHANGE THIS TO A DIRECTORY WHERE YOU WANT TO SAVE THE FINAL KEYWORDS
# Write the ranked keyword table to CSV (row index omitted)
final_keywords_path = '/anonymized_path/sakshikumar/Documents/UCD/04. SP 2025/464 Practicum/KWSM finale/final_keywords.csv'
final_keywords_sorted.to_csv(final_keywords_path, index=False)