In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Read the preprocessed dataset from the working directory
df = pd.read_csv("Final_Dataset.csv")

# -------------------------------
#  Feature Exploration
# -------------------------------
# Split the column names into three groups by exact storage dtype
# (float64 / int64 / object); columns of any other dtype land in no group.
float_features = df.select_dtypes(include="float64").columns
integer_features = df.select_dtypes(include="int64").columns
categorical_features = df.select_dtypes(include="object").columns

# Report how many columns fell into each group and what they are called
print("Total number of Floating features:", len(float_features))
print("Floating feature Names:", float_features.tolist())

print("Total number of Integer features:", len(integer_features))
print("Integer feature Names:", integer_features.tolist())

print("Total number of Categorical features:", len(categorical_features))
print("Categorical feature Names:", categorical_features.tolist())

# Show the distinct values of "QuestionKey" when the dataset has that column
if "QuestionKey" in df.columns:
    question_keys = df["QuestionKey"].unique()
    print("Unique QuestionKey values:", question_keys)

# -------------------------------
# Feature Engineering
# -------------------------------
# Discard columns whose name starts with "Unnamed", and the "TimeStamp"
# column, when they are still present
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed]
if "TimeStamp" in df.columns:
    df.drop("TimeStamp", axis=1, inplace=True)

# Replace every object-dtype column with integer codes, fitting a fresh
# encoder per column so the codes of one column never depend on another
object_columns = df.select_dtypes(include=['object']).columns
for name in object_columns:
    le = LabelEncoder()
    codes = le.fit_transform(df[name])
    df[name] = codes
    print(f"Encoded column: {name}")

# When 'QuestionKey' exists, store its encoding under 'QuestionKey_encoded'
# and remove the original column. If 'QuestionKey' had object dtype it was
# already turned into integer codes by the loop above, so this second fit
# runs on those codes rather than on the raw values.
if "QuestionKey" in df.columns:
    le = LabelEncoder()
    df['QuestionKey_encoded'] = le.fit_transform(df['QuestionKey'])
    df.drop(columns=['QuestionKey'], inplace=True)

# Downcast float columns (except large ones)
#
# Columns holding any value outside [-DOWNCAST_LIMIT, DOWNCAST_LIMIT] keep
# their dtype. Both bounds are checked: float16 overflows to +/-inf beyond a
# magnitude of 65504, so a column of large *negative* values must be
# protected just like a column of large positive ones.
# NOTE(review): float16 carries only ~3 significant decimal digits (values
# between 4096 and 8192 are stored in steps of 4), so the downcast is lossy
# even below the limit -- confirm this precision is acceptable downstream.
DOWNCAST_LIMIT = 5400

# `li` collects every numeric column that is too large to downcast.
_numeric = df.select_dtypes(include="number")
li = [
    col
    for col in _numeric.columns
    if _numeric[col].max() > DOWNCAST_LIMIT
    or _numeric[col].min() < -DOWNCAST_LIMIT
]

# Take the float64 columns from the frame as it is *now*. A column list
# captured before the drops/renames above can still name removed columns
# (e.g. a float "TimeStamp" or "QuestionKey"), and indexing `df` with such a
# name raises KeyError.
_too_large = set(li)
for col in df.select_dtypes(include=["float64"]).columns:
    if col not in _too_large:
        df[col] = df[col].astype("float16")

print("✅ Feature engineering complete")
print(df.head())

# -------------------------------
# Correlation Matrix + Heatmap
# -------------------------------
# Correlate the numeric columns only; other dtypes are left out
numeric_columns = df.select_dtypes(include=['number'])
corr = numeric_columns.corr()
print("\nCorrelation Matrix:\n", corr)

# Render the matrix as a heatmap with each cell annotated to two decimals
heatmap_style = {
    "annot": True,
    "cmap": "coolwarm",
    "fmt": ".2f",
    "linewidths": 0.5,
}
plt.figure(figsize=(12, 8))
sns.heatmap(corr, **heatmap_style)
plt.title("Correlation Matrix Heatmap")
plt.show()

# -------------------------------
# Save engineered dataset
# -------------------------------
# Write the engineered frame to disk, leaving the row index out of the file
df.to_csv(path_or_buf="df_features.csv", index=False)
print("✅ Feature engineered dataset saved as df_features.csv")
