In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw survey data into `df`
print("Loading dataset...")
dataset_path = '../dataset/Students_Social_Media_Addiction.csv'
df = pd.read_csv(dataset_path)
print(f"Dataset shape: {df.shape}")

# 1. Data Exploration
print("\nData Exploration")
print("-" * 50)
# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called directly instead of
# being wrapped in print(), which would emit a stray "None" line.
df.info()
# Summary statistics as produced by describe() with its default settings
print(df.describe())

# 2. Check for missing values
print("\nChecking for missing values...")
missing_values = df.isna().sum()  # per-column count of missing cells
if not missing_values.any():
    print("No missing values found!")
else:
    # Report only the columns that actually contain gaps
    print(missing_values[missing_values > 0])
    # Handle missing values if needed
    # df.fillna(...) or df.dropna(...)

# 3. Encode categorical variables
print("\nEncoding categorical variables...")
categorical_columns = df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {categorical_columns.tolist()}")

# Work on a copy so the raw frame `df` stays untouched
df_processed = df.copy()

# Columns with exactly two distinct values are label encoded in place;
# every other object column is expanded into one indicator column per
# category, appended at the end of the frame.
for col in categorical_columns:
    if df[col].nunique() != 2:
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=False)
        # Swap the original column for its indicator columns
        df_processed = pd.concat([df_processed.drop(columns=col), dummies], axis=1)
        print(f"One-hot encoded {col} into {len(dummies.columns)} new features")
        continue

    le = LabelEncoder()
    df_processed[col] = le.fit_transform(df[col])
    # Map each original class label to the integer code assigned to it
    class_codes = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"Label encoded {col} with values: {class_codes}")

# 4. Drop the identifier column, if present, from the processed frame
id_column = 'Student_ID'
if id_column in df_processed.columns:
    df_processed = df_processed.drop(columns=[id_column])
    print("Removed Student_ID column")
    
# 5. Feature engineering (if needed)
print("\nPerforming feature engineering...")
# Ratio of average daily usage hours to nightly sleep hours, added only when
# both source columns exist in the raw frame.
if 'Avg_Daily_Usage_Hours' in df.columns and 'Sleep_Hours_Per_Night' in df.columns:
    # A sleep value of 0 would turn the division into +/-inf, which would
    # then be written to the processed CSV. Treat a zero denominator as
    # missing so the ratio becomes NaN instead.
    sleep_hours = df['Sleep_Hours_Per_Night'].replace(0, np.nan)
    df_processed['Usage_to_Sleep_Ratio'] = df['Avg_Daily_Usage_Hours'] / sleep_hours
    print("Added Usage_to_Sleep_Ratio feature")

# 6. Write the processed frame to disk (without the index) and summarise it
print("\nSaving processed dataset...")
output_path = '../dataset/processed_data.csv'
df_processed.to_csv(output_path, index=False)
feature_names = df_processed.columns.tolist()
print(f"Saved processed dataset with shape: {df_processed.shape}")
print(f"Features available: {feature_names}")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Loading dataset...
Dataset shape: (705, 13)

Data Exploration
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Student_ID                    705 non-null    int64  
 1   Age                           705 non-null    int64  
 2   Gender                        705 non-null    object 
 3   Academic_Level                705 non-null    object 
 4   Country                       705 non-null    object 
 5   Avg_Daily_Usage_Hours         705 non-null    float64
 6   Most_Used_Platform            705 non-null    object 
 7   Affects_Academic_Performance  705 non-null    object 
 8   Sleep_Hours_Per_Night         705 non-null    float64
 9   Mental_Health_Score           705 non-null    int64  
 10  Relationship_Status           705 non-null    object 
 11  Conflicts_