In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the dataset from the CSV file in the current working directory.
df = pd.read_csv("mobile_addiction.csv", encoding="utf-8")

# Knowing The Data

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple
print(df.shape)

In [None]:
# Show the column names, non-null counts and data types.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Preview the first few rows of the data (5 by default)
df.head()

In [None]:
# Summary statistics for the numerical columns
df.describe()

In [None]:
# Summary statistics for the categorical (object-dtype) columns
df.describe(include='object')

In [None]:
# List the column names
df.columns

In [None]:
# Count how many rows fall into each class of the `addicted` column
df['addicted'].value_counts()

In [None]:
# Count the missing values in every column
print(df.isna().sum())

In [None]:
# Drop the unnecessary index column.
# errors='ignore' keeps this cell idempotent: running it again after the column
# has already been removed is a no-op instead of raising a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# List the column names again to confirm the index column is gone
df.columns

In [None]:
# Draw one box plot per numeric column so outliers can be inspected visually
numeric_columns = df.select_dtypes(include=['number']).columns
for column in numeric_columns:
    px.box(df, x=column, title= f'Box plot for {column}').show()


*There are almost no outliers in this dataset.*

In [None]:
features = [
    'daily_screen_time', 'app_sessions', 'social_media_usage',
    'gaming_time', 'notifications', 'night_usage', 'age',
    'work_study_hours', 'stress_level', 'apps_installed'
]

# Ordered labels shared by every binned column
bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# For each feature: bucket its values into up to five ordered bins, store the
# result in a `<feature>_binned` column of `df`, and plot the number of rows
# per bin split by the `addicted` column.
for feature in features:
    binned_col = f'{feature}_binned'

    try:
        unique_vals = df[feature].nunique()

        if unique_vals >= 10:
            # Quantile binning for features with many unique values.
            # The edges are computed once with duplicates='drop' (tied data can
            # collapse some of the five quantile edges) and those SAME edges
            # are then applied with pd.cut. Calling qcut a second time with
            # q=num_bins would compute different quantiles and could raise
            # again on duplicate edges.
            bin_edges = pd.qcut(df[feature], q=5, retbins=True, duplicates='drop')[1]
            num_bins = len(bin_edges) - 1
            df[binned_col] = pd.cut(
                df[feature],
                bins=bin_edges,
                labels=bin_labels[:num_bins],
                include_lowest=True,  # keep the minimum value inside the first bin
            )
        else:
            # Fallback: cut into five equal-width bins
            df[binned_col] = pd.cut(df[feature], bins=5, labels=bin_labels)

        # Plot
        sns.countplot(x=binned_col, hue='addicted', data=df)
        plt.title(f'Addiction Status by {feature.replace("_", " ").title()}')
        plt.xlabel(feature.replace('_', ' ').title())
        plt.ylabel('Count')
        plt.xticks(rotation=30)
        plt.tight_layout()
        plt.show()

    except (KeyError, TypeError, ValueError) as e:
        # Best-effort: a missing column or a feature that cannot be binned or
        # plotted is reported and skipped so the remaining features still render.
        # Anything else is unexpected and is allowed to surface.
        print(f"Skipping {feature} due to error: {e}")

I noticed that for social media usage, gaming time, and night usage, the number of addicted people decreases as the usage time increases, so I decided to calculate the proportion of each class within each bin instead of the raw count.


In [None]:
# Features to visualize
features = [
    'daily_screen_time', 'app_sessions', 'social_media_usage',
    'gaming_time', 'notifications', 'night_usage', 'age',
    'work_study_hours', 'stress_level', 'apps_installed'
]

# Ordered labels shared by every binned column
bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Legend names for the target classes. Both encodings are covered: the column
# still holds the raw strings when the notebook is run top to bottom (the
# integer encoding happens in a later cell), and holds 0/1 if this cell is
# re-run afterwards. Any other value falls back to its string form.
column_map = {
    0: 'Not Addicted', 1: 'Addicted',
    'not addicted': 'Not Addicted', 'addicted': 'Addicted',
}

# For each feature: bin it, compute the share of each target class inside
# every bin, and draw the shares as a stacked bar chart.
for feature in features:
    binned_col = f'{feature}_binned'

    unique_vals = df[feature].nunique()

    # Choose binning strategy based on value spread
    if unique_vals >= 10:
        # Quantile binning. The edges are computed once with duplicates='drop'
        # and those SAME edges are applied with pd.cut; a second qcut call with
        # q=num_bins would compute different quantiles and could raise on
        # duplicate edges.
        bin_edges = pd.qcut(df[feature], q=5, retbins=True, duplicates='drop')[1]
        num_bins = len(bin_edges) - 1
        df[binned_col] = pd.cut(
            df[feature],
            bins=bin_edges,
            labels=bin_labels[:num_bins],
            include_lowest=True,  # keep the minimum value inside the first bin
        )
    else:
        # Equal-width binning for low-uniqueness features
        df[binned_col] = pd.cut(df[feature], bins=5, labels=bin_labels)

    # Count rows per (bin, class); observed=True skips empty category
    # combinations and suppresses the related pandas warning
    grouped = df.groupby([binned_col, 'addicted'], observed=True).size().reset_index(name='count')
    total_per_bin = grouped.groupby(binned_col, observed=True)['count'].transform('sum')
    grouped['percentage'] = grouped['count'] / total_per_bin * 100

    # Pivot to one row per bin and one column per class for the stacked bar plot
    pivot = grouped.pivot(index=binned_col, columns='addicted', values='percentage').fillna(0)

    # Rename only the class columns that are actually present
    pivot.columns = [column_map.get(c, str(c)) for c in pivot.columns]

    # Plot
    pivot.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='Set2')
    plt.title(f'Addiction Percentage by {feature.replace("_", " ").title()}')
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Percentage')
    plt.legend(title='Addiction Status')
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the selected features, drawn as an annotated heatmap
feature_corr = df[features].corr(numeric_only=True)
sns.heatmap(feature_corr, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()


*Note:* No features are truly redundant, but some are closely related: `social_media_usage`, `gaming_time`, and `night_usage` are all part of `daily_screen_time`, and `app_sessions` might correlate with `apps_installed`.

In [None]:
# Encode the target column: 'not addicted' -> 0, 'addicted' -> 1.
# The dtype guard makes the cell idempotent: once the column is already an
# integer column, running the cell again leaves it untouched (a second .map()
# would turn every value into NaN and make astype(int) fail).
label_to_int = {'not addicted': 0, 'addicted': 1}
if not pd.api.types.is_integer_dtype(df['addicted']):
    encoded = df['addicted'].map(label_to_int)
    # Fail with an explicit message if the column holds a label that is not in the mapping
    unknown_labels = df.loc[encoded.isna(), 'addicted'].unique().tolist()
    if unknown_labels:
        raise ValueError(f"Unexpected values in 'addicted': {unknown_labels}")
    df['addicted'] = encoded.astype(int)

In [None]:
print(df['addicted'].unique())  # Expected: only 1 and 0 (order follows first appearance in the data)

*The `addicted` column, whose values were strings (object dtype), has been encoded as 1 (addicted) and 0 (not addicted).*

In [None]:
# Correlations between every numeric column, now including the encoded target
full_corr = df.corr(numeric_only=True)
sns.heatmap(full_corr, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Matrix (Including Addicted)")
plt.show()


*Note:* This heatmap indicates that the `addicted` column is correlated with all of the other features.

# Models:

###      1- Logistic Regression

In [None]:
# Build the modelling frame on a copy so that `df` itself is left untouched
df_model = df.copy()

# Turn the string labels into 1/0; values that are not keys of the mapping are left as they are
df_model['addicted'] = df_model['addicted'].replace({'addicted': 1, 'not addicted': 0}).astype(int)

# Keep numeric columns only (this removes the categorical binned columns)
df_model = df_model.select_dtypes(include=['number'])

# Target vector and feature matrix
y = df_model['addicted']
X = df_model.drop(columns='addicted')

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale numeric features: fit on the training split only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the Logistic Regression model on the scaled training data
logreg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_scaled, y_train)

# Class predictions and probability of the positive class (column 1)
y_pred = logreg.predict(X_test_scaled)
y_proba = logreg.predict_proba(X_test_scaled)[:, 1]

# Report the evaluation metrics on the held-out split
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


**Logistic Regression performance**

- Accuracy: 98%
- Precision & recall (both classes): 98%
- Confusion matrix: only 26 false positives and 26 false negatives out of 2718 test samples.
- ROC AUC: very likely near 0.98 (the value is not recorded in this note; the exact figure is printed by the cell above).



###      2- Random Forest Classifier

In [None]:
# Fit a 100-tree Random Forest on the unscaled training features
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class predictions and probability of the positive class (column 1)
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

# Report the evaluation metrics on the held-out split
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Note: We're using the unscaled X_train and X_test because tree-based models like Random Forest don’t require feature scaling.

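**Random Forest performance**

- Accuracy: 98.09%, on par with Logistic Regression (both confusion matrices show 52 misclassified samples out of 2718).
- ROC AUC: 0.998, higher than the ~0.98 estimated for Logistic Regression. This means the model ranks addicted cases above non-addicted ones more reliably; note that ROC AUC measures discrimination, not probability calibration.
- Precision/Recall/F1: 98% for both classes
- Confusion matrix: 25 false positives and 27 false negatives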