In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

In [121]:
file_path = 'South_East_Asia_Social_Media_MentalHealth.csv'
data = pd.read_csv(file_path)

data.head().T

Unnamed: 0,0,1,2,3,4
Country,Myanmar,Indonesia,Vietnam,Philippines,Laos
Age Group,18-25,18-25,36-45,26-35,56-65
Gender,Male,Female,Male,Male,Male
Urban/Rural,Rural,Urban,Rural,Urban,Rural
Daily SM Usage (hrs),2.89,3.24,7.84,1.25,3.94
Most Used SM Platform,WeChat,Instagram,Instagram,WeChat,Facebook
Frequency of SM Use,Weekly,Rarely,Daily,Rarely,Daily
Likes Received (per post),652,295,412,387,148
Comments Received (per post),80,429,64,309,45
Shares Received (per post),226,45,2,273,180


In [122]:
# 1. Checking for missing values
print("Missing Values in each column:")
data.isnull().sum()

In [123]:
# Handling missing values by imputing median for numerical columns
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    data[col] = data[col].fillna(data[col].median())

# Handling missing values by imputing mode for categorical columns
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].fillna(data[col].mode()[0])

print("Missing Values After Handling:")
data.isnull().sum()

In [124]:
# 2. Detecting and Handling Outliers
# Use boxplots to visualize outliers in numerical columns
def detect_outliers_iqr(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = series[(series < lower_bound) | (series > upper_bound)]
    return outliers

In [125]:
# Numerical columns
numerical_columns = ['Daily SM Usage (hrs)', 'Likes Received (per post)', 'Comments Received (per post)', 
                     'Shares Received (per post)', 'Peer Comparison Frequency (1-10)', 
                     'Social Anxiety Level (1-10)', 'Body Image Impact (1-10)', 
                     'Sleep Quality Impact (1-10)', 'Self Confidence Impact (1-10)', 
                     'Cyberbullying Experience (1-10)', 'Anxiety Levels (1-10)']

# Create subplots
n_cols = 3
n_rows = (len(numerical_columns) // n_cols) + (len(numerical_columns) % n_cols > 0)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 2 * n_rows))
axes = axes.flatten()

for i, col in enumerate(numerical_columns):
    if col in data.columns:
        sns.boxplot(x=data[col], ax=axes[i])
        axes[i].set_title(f'Boxplot for {col}')
        
        outliers = detect_outliers_iqr(data[col].dropna())
        print(f"{col} has {len(outliers)} outliers")

plt.tight_layout()
plt.show()

In [126]:
Handle outliers using IQR if there are any
for col in numerical_columns:
   Q1 = data[col].quantile(0.25)
   Q3 = data[col].quantile(0.75)
   IQR = Q3 - Q1
   lower_bound = Q1 - 1.5 * IQR
   upper_bound = Q3 + 1.5 * IQR
   data[col] = np.where(data[col] < lower_bound, lower_bound, data[col])
   data[col] = np.where(data[col] > upper_bound, upper_bound, data[col])

In [127]:
# Save the preprocessed data
data.to_csv('Preprocessed_Dataset.csv', index=False)

file_path = 'Preprocessed_Dataset.csv'
data = pd.read_csv(file_path)

In [128]:
# data.head()

In [129]:
# 3. Data Transformation
#Categorize the 'Age Group' column into 'Youth', 'Adult', and 'Senior' categories
data['Age Category'] = data['Age Group'].apply(lambda age: 'Youth' if age in ['18-25', '26-35'] else ('Adult' if age in ['36-45', '46-55'] else 'Senior'))

print("\nDataset with 'Age Category' column:")
data[['Age Group', 'Age Category']].head()

#Combine 'Likes Received (per post)', 'Comments Received (per post)', and 'Shares Received (per post)' into 'Total Social Interaction'
data['Total Social Interaction'] = data['Likes Received (per post)'] + data['Comments Received (per post)'] + data['Shares Received (per post)']

data[['Likes Received (per post)', 'Comments Received (per post)', 'Shares Received (per post)', 'Total Social Interaction']].head()

# Create a new column 'Usage Intensity' based on 'Daily SM Usage (hrs)'
data['Usage Intensity'] = data['Daily SM Usage (hrs)'].apply(lambda hours: 'Low' if hours < 2 else 'Medium' if hours < 5 else 'High')

print("\nDataset with 'Usage Intensity' column:")
data[['Daily SM Usage (hrs)', 'Usage Intensity']].head()


Dataset with 'Age Category' column:

Dataset with 'Usage Intensity' column:


Unnamed: 0,Daily SM Usage (hrs),Usage Intensity
0,2.89,Medium
1,3.24,Medium
2,7.84,High
3,1.25,Low
4,3.94,Medium


In [130]:
# Categorize Social Anxiety Level into Low, Medium, High
data['Social_Anxiety_Category'] = pd.cut(data['Social Anxiety Level (1-10)'], bins=[0, 3, 7, 10], 
                                         labels=['Low', 'Medium', 'High'], right=True)

# Check the distribution of the new target variable
print("Class distribution in the new target variable:")
print(data['Social_Anxiety_Category'].value_counts())

Class distribution in the new target variable:
Social_Anxiety_Category
Medium    136027
Low       102842
High      102047
Name: count, dtype: int64


In [131]:
print("\nDataset with 'Social_Anxiety_Category' column:")
data[['Social Anxiety Level (1-10)', 'Social_Anxiety_Category']].head()


Dataset with 'Social_Anxiety_Category' column:


Unnamed: 0,Social Anxiety Level (1-10),Social_Anxiety_Category
0,1,Low
1,2,Low
2,10,High
3,7,Medium
4,10,High


In [150]:
# Manually encode the categories with desired values: Low = 0, Medium = 1, High = 2
category_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
data['Social_Anxiety_Category'] = data['Social_Anxiety_Category'].map(category_mapping)

# Display encoding map
print(f"\nEncoding for 'Social_Anxiety_Category':")
encoding_map_sac = pd.DataFrame({
    'Original': category_mapping.keys(),
    'Encoded': category_mapping.values()
})
print(encoding_map_sac)

Class distribution in the new target variable:
Series([], Name: count, dtype: int64)

Encoding for 'Social_Anxiety_Category':
  Original  Encoded
0      Low        0
1   Medium        1
2     High        2


In [133]:
data = data.drop(columns=['Social Anxiety Level (1-10)'])

In [134]:
# 3. Data Transformation
# Data Encoding for Categorical Variables
# # Initialize the LabelEncoder
encoder = LabelEncoder()

# Define columns to encode (combining the columns from both snippets)
encode_columns = [
    'Urban/Rural', 'Gender', 'Frequency of SM Use', 'Education Level', 
    'Country', 'Socioeconomic Status', 'State', 'Age Category', 'Usage Intensity',
    'Most Used SM Platform'
]

encoder = LabelEncoder()
# Create a dictionary to store encoding maps
encoding_maps = {}

print("\n============================== Data Encoding ==============================\n")

for column in encode_columns:
    data[column] = encoder.fit_transform(data[column])
    
    encoding_maps[column] = dict(zip(encoder.classes_, range(len(encoder.classes_))))
    
    print(f"\n------------------------------- Encoding for '{column}' -------------------------------")
    encoding_map = pd.DataFrame({
        'Original': encoder.classes_,
        'Encoded': range(len(encoder.classes_))
    })
    
    print(encoding_map.to_string(index=False))
    print("\n---------------------------------------------------------------------------------------------")

print("\n=============================== Encoding Completed ===============================\n")




------------------------------- Encoding for 'Urban/Rural' -------------------------------
Original  Encoded
   Rural        0
   Urban        1

---------------------------------------------------------------------------------------------

------------------------------- Encoding for 'Gender' -------------------------------
Original  Encoded
  Female        0
    Male        1

---------------------------------------------------------------------------------------------

------------------------------- Encoding for 'Frequency of SM Use' -------------------------------
Original  Encoded
   Daily        0
 Monthly        1
  Rarely        2
  Weekly        3

---------------------------------------------------------------------------------------------

------------------------------- Encoding for 'Education Level' -------------------------------
   Original  Encoded
 Bachelor's        0
  Doctorate        1
High School        2
   Master's        3

----------------------------------

In [135]:
data.head().T

Unnamed: 0,0,1,2,3,4
Country,6,3,10,7,4
Age Group,18-25,18-25,36-45,26-35,56-65
Gender,1,0,1,1,1
Urban/Rural,0,1,0,1,0
Daily SM Usage (hrs),2.89,3.24,7.84,1.25,3.94
Most Used SM Platform,4,1,1,4,0
Frequency of SM Use,3,2,0,2,0
Likes Received (per post),652,295,412,387,148
Comments Received (per post),80,429,64,309,45
Shares Received (per post),226,45,2,273,180


In [136]:
data['Usage Anxiety Interaction'] = data['Daily SM Usage (hrs)'] * data['Social_Anxiety_Category']

In [137]:
print("\nDataset with 'Usage Anxiety Interaction columns:")
data[['Daily SM Usage (hrs)','Social_Anxiety_Category','Usage Anxiety Interaction']].head()


Dataset with 'Usage Anxiety Interaction columns:


Unnamed: 0,Daily SM Usage (hrs),Social_Anxiety_Category,Usage Anxiety Interaction
0,2.89,1,2.89
1,3.24,1,3.24
2,7.84,0,0.0
3,1.25,2,2.5
4,3.94,0,0.0


In [138]:
# data =data.drop(columns=['Urban/Rural', 'Daily SM Usage (hrs)', 'Education Level'])
data = data.drop(columns=['Age Group'])

In [139]:
from sklearn.model_selection import train_test_split

X = data.drop(columns=['Social_Anxiety_Category'])
y = data['Social_Anxiety_Category']

# Split the data into 80% training (which includes training + validation) and 20% testing
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=54)  

# From the 80% training data, split it into 75% training and 25% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=54) 

# Combine features and target for each set to save to CSV
train_data = X_train.copy()
train_data['Social_Anxiety_Category'] = y_train

val_data = X_val.copy()
val_data['Social_Anxiety_Category'] = y_val

test_data = X_test.copy()
test_data['Social_Anxiety_Category'] = y_test

In [140]:
# Display the features (X) and target variable (y)
print("Features (X):")
print(X.head())  # Shows the first 5 rows of the features

print("\nTarget Variable (y):")
print(y.head())  # Shows the first 5 rows of the target variable

Features (X):
   Country  Gender  Urban/Rural  Daily SM Usage (hrs)  Most Used SM Platform  \
0        6       1            0                  2.89                      4   
1        3       0            1                  3.24                      1   
2       10       1            0                  7.84                      1   
3        7       1            1                  1.25                      4   
4        4       1            0                  3.94                      0   

   Frequency of SM Use  Likes Received (per post)  \
0                    3                        652   
1                    2                        295   
2                    0                        412   
3                    2                        387   
4                    0                        148   

   Comments Received (per post)  Shares Received (per post)  \
0                            80                         226   
1                           429                          45 

In [141]:
train_data.head()

Unnamed: 0,Country,Gender,Urban/Rural,Daily SM Usage (hrs),Most Used SM Platform,Frequency of SM Use,Likes Received (per post),Comments Received (per post),Shares Received (per post),Peer Comparison Frequency (1-10),...,Body Image Impact (1-10),Sleep Quality Impact (1-10),Self Confidence Impact (1-10),Cyberbullying Experience (1-10),Anxiety Levels (1-10),Age Category,Total Social Interaction,Usage Intensity,Usage Anxiety Interaction,Social_Anxiety_Category
188503,7,1,0,10.04,2,3,90,464,222,9,...,10,9,8,9,5,2,776,0,10.04,1
164774,8,0,1,11.15,2,2,170,19,292,6,...,8,10,4,8,7,0,481,0,11.15,1
21340,5,0,0,4.85,1,0,189,246,218,6,...,8,7,2,7,8,0,653,2,0.0,0
325012,8,0,0,6.63,4,1,48,87,172,2,...,1,1,2,1,3,1,307,0,6.63,1
53462,9,1,0,11.73,2,3,459,429,230,2,...,5,9,10,5,6,2,1118,0,23.46,2


In [142]:
# Separate features and target variable
X_train = train_data.drop(columns=['Social_Anxiety_Category'])
y_train = train_data['Social_Anxiety_Category']

X_val = val_data.drop(columns=['Social_Anxiety_Category'])
y_val = val_data['Social_Anxiety_Category']

X_test = test_data.drop(columns=['Social_Anxiety_Category'])
y_test = test_data['Social_Anxiety_Category']

In [143]:
train_data.head()

Unnamed: 0,Country,Gender,Urban/Rural,Daily SM Usage (hrs),Most Used SM Platform,Frequency of SM Use,Likes Received (per post),Comments Received (per post),Shares Received (per post),Peer Comparison Frequency (1-10),...,Body Image Impact (1-10),Sleep Quality Impact (1-10),Self Confidence Impact (1-10),Cyberbullying Experience (1-10),Anxiety Levels (1-10),Age Category,Total Social Interaction,Usage Intensity,Usage Anxiety Interaction,Social_Anxiety_Category
188503,7,1,0,10.04,2,3,90,464,222,9,...,10,9,8,9,5,2,776,0,10.04,1
164774,8,0,1,11.15,2,2,170,19,292,6,...,8,10,4,8,7,0,481,0,11.15,1
21340,5,0,0,4.85,1,0,189,246,218,6,...,8,7,2,7,8,0,653,2,0.0,0
325012,8,0,0,6.63,4,1,48,87,172,2,...,1,1,2,1,3,1,307,0,6.63,1
53462,9,1,0,11.73,2,3,459,429,230,2,...,5,9,10,5,6,2,1118,0,23.46,2


In [144]:
y_train.unique()

array([1, 0, 2])

In [146]:
from sklearn.feature_selection import SelectKBest, f_classif

# Perform SelectKBest feature selection
selector = SelectKBest(score_func=f_classif, k=15)
X_new = selector.fit_transform(X_train, y_train)

# Get the F-values of all features
f_values = selector.scores_

# Get the selected and non-selected features
selected_mask = selector.get_support()

# Get selected feature names and their F-values
selected_f_values = {X_train.columns[i]: f_values[i] for i in range(len(f_values)) if selected_mask[i]}

# Get non-selected feature names and their F-values
non_selected_f_values = {X_train.columns[i]: f_values[i] for i in range(len(f_values)) if not selected_mask[i]}

# Display selected and non-selected F-values
print("F-values of selected features:")
for feature, f_value in selected_f_values.items():
    print(f"{feature}: {f_value}")

print("\nF-values of non-selected features:")
for feature, f_value in non_selected_f_values.items():
    print(f"{feature}: {f_value}")

F-values of selected features:
Likes Received (per post): 0.8982514851585469
Comments Received (per post): 0.3771619197093208
Peer Comparison Frequency (1-10): 0.35231166135274306
Socioeconomic Status: 0.6611652068093375
Education Level: 1.8011654928155885
State: 1.9067922772214703
Body Image Impact (1-10): 2.000372103305583
Sleep Quality Impact (1-10): 1.6662372745469354
Self Confidence Impact (1-10): 1.0478803437382562
Cyberbullying Experience (1-10): 1.1002503950766067
Anxiety Levels (1-10): 0.4031450255167098
Age Category: 1.8519698730225083
Total Social Interaction: 0.9750823553891216
Usage Intensity: 0.5758501226879487
Usage Anxiety Interaction: 131359.54665592345

F-values of non-selected features:
Country: 0.15344681727973128
Gender: 0.17285414489704973
Urban/Rural: 0.12321875514560637
Daily SM Usage (hrs): 0.007748888597521775
Most Used SM Platform: 0.15527970885509468
Frequency of SM Use: 0.14010150209803252
Shares Received (per post): 0.28427359100503785


In [147]:
# Remove 'State' from the selected features
selected_features = [feature for feature in selected_features if feature != 'State']

X_selected = X[selected_features]

# Print the updated selected features
print("Updated Selected features:", selected_features)

Updated Selected features: ['Likes Received (per post)', 'Comments Received (per post)', 'Peer Comparison Frequency (1-10)', 'Socioeconomic Status', 'Education Level', 'Body Image Impact (1-10)', 'Sleep Quality Impact (1-10)', 'Self Confidence Impact (1-10)', 'Cyberbullying Experience (1-10)', 'Anxiety Levels (1-10)', 'Age Category', 'Total Social Interaction', 'Usage Intensity', 'Usage Anxiety Interaction']


In [149]:
X_selected.head().T

Unnamed: 0,0,1,2,3,4
Likes Received (per post),652.0,295.0,412.0,387.0,148.0
Comments Received (per post),80.0,429.0,64.0,309.0,45.0
Peer Comparison Frequency (1-10),1.0,8.0,8.0,1.0,10.0
Socioeconomic Status,0.0,0.0,0.0,2.0,1.0
Education Level,2.0,1.0,0.0,2.0,0.0
Body Image Impact (1-10),7.0,6.0,3.0,2.0,9.0
Sleep Quality Impact (1-10),1.0,4.0,8.0,3.0,2.0
Self Confidence Impact (1-10),8.0,5.0,6.0,3.0,6.0
Cyberbullying Experience (1-10),6.0,3.0,2.0,10.0,3.0
Anxiety Levels (1-10),6.0,3.0,3.0,2.0,6.0


In [92]:
# Split the data into 80% training (which includes training + validation) and 20% testing
X_train_full, X_test, y_train_full, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=54)  

# From the 80% training data, split it into 75% training and 25% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=54)  

# Combine features and target for each set to save to CSV
train_data = X_train.copy()
train_data['Social_Anxiety_Category'] = y_train

val_data = X_val.copy()
val_data['Social_Anxiety_Category'] = y_val

test_data = X_test.copy()
test_data['Social_Anxiety_Category'] = y_test


In [93]:
print("Training Dataset Size:",train_data.shape)
print("Validation Dataset Size:",val_data.shape)
print("Testing Dataset Size:",test_data.shape)

Training Dataset Size: (204549, 15)
Validation Dataset Size: (68183, 15)
Testing Dataset Size: (68184, 15)


In [94]:
# Save the datasets into CSV files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

print("Datasets have been split and saved as 'train_data.csv', 'val_data.csv', and 'test_data.csv'.")

Datasets have been split and saved as 'train_data.csv', 'val_data.csv', and 'test_data.csv'.
