<a href="https://colab.research.google.com/github/ttderessa/Temesgen-Deressa/blob/main/Problematic_internet_Use_Child_Mind_Institute.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 """
 Description: This script preprocesses data, trains multiple machine learning models,
 and evaluates them using a Voting Classifier for Child Mind Institute — Problematic Internet Use
 """
# ============================================
# ============================================
# 1. Data Manipulation and Numerical Operations
# ============================================
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations

# ============================================
# 2. Machine Learning Libraries
# ============================================
from sklearn.model_selection import train_test_split  # For splitting the dataset
from sklearn.linear_model import LogisticRegression  # For logistic regression
from sklearn.ensemble import (
    RandomForestClassifier,  # Random Forest Classifier
    AdaBoostClassifier,  # Adaptive Boosting
    GradientBoostingClassifier,  # Gradient Boosting
    VotingClassifier  # Ensemble voting classifier
)
from sklearn.tree import DecisionTreeClassifier  # Decision tree
from xgboost import XGBClassifier  # Extreme Gradient Boosting Classifier

# ============================================
# 3. Preprocessing and Feature Scaling
# ============================================
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling and encoding
from sklearn.impute import SimpleImputer  # For handling missing values (simple strategy)

# Explicitly enable IterativeImputer (experimental)
from sklearn.experimental import enable_iterative_imputer  # This is required to use IterativeImputer
from sklearn.impute import IterativeImputer  # Advanced imputation (experimental)

# ============================================
# 4. Model Evaluation Metrics
# ============================================
from sklearn.metrics import (
    accuracy_score,  # For model accuracy
    precision_score,  # For precision metric
    recall_score,  # For recall metric
    f1_score,  # For F1-score
    roc_auc_score,  # For ROC-AUC score
    confusion_matrix  # For confusion matrix
)

# ============================================
# 5. Visualization Libraries
# ============================================
import seaborn as sns  # For statistical data visualization
import matplotlib.pyplot as plt  # For general plotting

# ============================================
# 6. Model Persistence
# ============================================
import joblib  # To save and load trained models

# ============================================
# 7. Load Training and Test Data from Kaggle Input Path
# ============================================
# Read the training CSV first, then the test CSV, from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv',
        '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv',
    )
)


# ============================================
# 8. Handle Missing Values
# ============================================
def _impute_columns(train_df, test_df, columns, strategy):
    """Fit a SimpleImputer on ``train_df[columns]`` and fill both frames in place."""
    columns = list(columns)
    if strategy != "constant":
        # For every strategy except "constant", SimpleImputer discards columns
        # that have no observed value at fit time; assigning its output back
        # would then fail on a shape mismatch. Such columns are left untouched.
        columns = [col for col in columns if train_df[col].notna().any()]
    if not columns or train_df.empty:
        # Nothing to fit on; SimpleImputer rejects empty input.
        return

    imputer = SimpleImputer(strategy=strategy)
    train_df[columns] = imputer.fit_transform(train_df[columns])
    if not test_df.empty:
        # Reuse the statistics learned from the training data only.
        test_df[columns] = imputer.transform(test_df[columns])


def handle_missing_values(train_df, test_df, strategy="mean"):
    """
    Handles missing values in train and test DataFrames.

    Only the columns present in both frames are kept, in the order they
    appear in ``train_df``. Numeric columns are imputed with ``strategy``;
    all other columns are imputed with their most frequent value. Imputers
    are fitted on the training data and then applied to the test data.
    Columns with no observed value in the training data cannot be imputed
    and are returned unchanged. The input frames are not modified.

    Parameters:
        train_df (pd.DataFrame): Training DataFrame.
        test_df (pd.DataFrame): Testing DataFrame.
        strategy (str): Strategy to handle missing values ("mean", "median", or "most_frequent").

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: Cleaned training and testing DataFrames.
    """
    # Keep the shared columns in train order; a plain set intersection would
    # yield a different column order from run to run.
    test_columns = set(test_df.columns)
    common_columns = [col for col in train_df.columns if col in test_columns]
    train_df = train_df[common_columns].copy()
    test_df = test_df[common_columns].copy()

    # Column kinds are decided from the training data
    num_cols = train_df.select_dtypes(include=["number"]).columns
    cat_cols = train_df.select_dtypes(exclude=["number"]).columns

    _impute_columns(train_df, test_df, num_cols, strategy)
    _impute_columns(train_df, test_df, cat_cols, "most_frequent")

    return train_df, test_df

# ============================================
# 9. Clean Data
# ============================================
try:
    # Check columns before cleaning
    print("Train DataFrame columns:", train_df.columns.tolist())
    print("Test DataFrame columns:", test_df.columns.tolist())

    # The target is re-attached and used for row filtering below, so fail
    # early with a clear message if it is absent.
    if 'sii' not in train_df.columns:
        raise KeyError("Target column 'sii' not found in the training data")

    # Clean missing values (only columns shared by train and test are kept)
    train_df_cleaned, test_df_cleaned = handle_missing_values(train_df, test_df, strategy="mean")

    # Add the original, un-imputed 'sii' column back to `train_df_cleaned`;
    # the assignment is aligned on the row index.
    train_df_cleaned['sii'] = train_df['sii']

    # Remove rows with null values in the 'sii' column
    train_df_cleaned = train_df_cleaned.dropna(subset=['sii'])

    # Rename train_df_cleaned back to train_df
    train_df = train_df_cleaned
    test_df = test_df_cleaned

    # Display cleaned data. DataFrame.info() prints its report itself and
    # returns None, so it is called directly rather than wrapped in print().
    print("\nCleaned Train DataFrame (renamed to train_df):")
    train_df.info()
    print("\nCleaned Test DataFrame:")
    test_df.info()

except Exception as e:
    # Report the failure, then re-raise: continuing would run the rest of the
    # script on uncleaned data.
    print(f"Error occurred: {e}")
    raise

# ============================================
# 10. Check for Remaining Null Values
# ============================================
print("\nNull values in 'sii' column of Train DataFrame:")
print(train_df['sii'].isnull().sum())

# ============================================
# 11. Separate Features and Target Variable
# ============================================
# NOTE(review): 'id' is assumed to be a per-row identifier in the competition
# CSVs — confirm. One-hot encoding it would add one indicator column per
# training row, so it is excluded from the features when present.
X_train = train_df.drop(columns=['sii']).drop(columns=['id'], errors='ignore')  # 'sii' is the target variable
y_train = train_df['sii']

# ============================================
# 12. Preprocess Test Data
# ============================================
X_test = test_df.drop(columns=['sii', 'id'], errors='ignore')  # Remove 'sii'/'id' if they exist in test data

# One-hot encode categorical features. The baseline level is dropped on the
# training data only: dropping the first level independently on the test data
# could remove a different level when the test set lacks the training
# baseline, which would mis-encode those rows after alignment.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)
X_test_encoded = pd.get_dummies(X_test)

# Ensure that train and test data have the same columns after one-hot encoding:
# the left join keeps exactly the training columns, discards test-only columns
# (including the baseline level) and fills columns missing from test with 0.
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

# ============================================
# 13. Encode Labels for Target Variable
# ============================================
# Use pandas' dtype helpers instead of comparing against 'object'/'float' or
# calling np.issubdtype, which raises TypeError on pandas extension dtypes.
if not pd.api.types.is_numeric_dtype(y_train):
    # Non-numeric labels (e.g. strings) are mapped to integer class ids.
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
elif not pd.api.types.is_integer_dtype(y_train):
    # Numeric but not integer (e.g. float or bool): cast to integer class ids.
    y_train = y_train.astype(int)

# ============================================
# 14. Split Data into Training and Validation Sets
# ============================================
# Split before scaling so that the validation rows do not influence the
# scaling statistics. The split uses the same size and seed as before.
X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train_encoded, y_train, test_size=0.2, random_state=42
)

# ============================================
# 15. Scale Features for Logistic Regression
# ============================================
scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_split)  # Fit on the training split only
X_valid_split = scaler.transform(X_valid_split)  # Scale validation data using the same scaler
X_train_scaled = scaler.transform(X_train_encoded)  # Full training set, scaled with the same scaler
X_test_scaled = scaler.transform(X_test_encoded)  # Scale test data using the same scaler

# ============================================
# 16. Initialize Models
# ============================================
# `multi_class` is not passed: it is deprecated in recent scikit-learn
# releases, and the lbfgs solver already fits a multinomial model when the
# target has more than two classes.
logreg_model = LogisticRegression(solver='lbfgs', max_iter=200)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
dt_model = DecisionTreeClassifier(max_depth=5, random_state=42)
# NOTE(review): 'logloss' is XGBoost's binary log-loss metric; if 'sii' has
# more than two classes, 'mlogloss' is the matching metric — confirm.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)

# ============================================
# 17. Define and Train Voting Classifier
# ============================================
voting_clf = VotingClassifier(
    estimators=[  # List of individual models
        ('logreg', logreg_model),
        ('rf', rf_model),
        ('adaboost', adaboost_model),
        ('gb', gb_model),
        ('dt', dt_model),
        ('xgb', xgb_model)
    ],
    voting='hard'  # Majority vote over the predicted class labels
)

# Train the Voting Classifier (fits every listed estimator on the training split)
voting_clf.fit(X_train_split, y_train_split)

# ============================================
# 18. Evaluate the Voting Classifier
# ============================================
# Predict the held-out validation rows and report the share of exact matches
y_valid_pred = voting_clf.predict(X_valid_split)
validation_accuracy = accuracy_score(
    y_true=y_valid_split,
    y_pred=y_valid_pred,
)
print(f"Validation Accuracy (Voting Classifier): {validation_accuracy:.4f}")

# ============================================
# 19. Make Predictions on Test Set
# ============================================
# The test features were scaled with the scaler fitted on the training data
test_predictions = voting_clf.predict(X_test_scaled)
