### Question 1

Check the following:
- DataFrame Shape
- Data Types
- Duplicate records (remove if found)
- Missing/Null Values (replace with median value)

In [None]:
import pandas as pd

# NOTE(review): `df` is not created in this cell; it is presumably loaded by an
# earlier cell (e.g. with pd.read_csv) -- confirm before running.

# Manually cleaned names for the leading columns, in file order.
clean_names = [
    "ID", "Name", "Age", "Gender", "Location", "School_Grade", "Daily_Usage_Hours",
    "Sleep_Hours", "Academic_Performance", "Social_Interactions", "Exercise_Hours",
    "Anxiety_Level", "Depression_Level", "Self_Esteem", "Parental_Control", "Screen_Time_Before_Bed"
]

# Later cells read columns that are not in this list (Addiction_Level,
# Time_on_Social_Media, Weekend_Usage_Hours, ...), so the frame presumably has
# more columns than the ones named here. Assigning a shorter list straight to
# df.columns raises "Length mismatch", so only the leading columns are renamed
# and any remaining columns keep their existing names. A frame with fewer
# columns than clean_names still raises, exactly as before.
df.columns = clean_names + list(df.columns[len(clean_names):])

# Convert to correct data types
numeric_cols = [
    "Age", "Daily_Usage_Hours", "Sleep_Hours", "Academic_Performance", "Social_Interactions",
    "Exercise_Hours", "Anxiety_Level", "Depression_Level", "Self_Esteem",
    "Parental_Control", "Screen_Time_Before_Bed"
]

# errors="coerce" turns values that cannot be parsed as numbers into NaN, so
# they are filled with the median below together with genuinely missing data.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Remove duplicates (rows that are identical across every column)
df = df.drop_duplicates()

# Fill missing values with each column's median, computed after de-duplication
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Display output; the null counts cover only the numeric columns filled above
print(" Shape:", df.shape)
print(" Data Types:\n", df.dtypes)
print(" Nulls:\n", df[numeric_cols].isnull().sum())


### Question 2:

How does time spent on social media compare to time spent on education among teens?

- Plot side-by-side histograms of Time on Social Media and Time on Education
- Find and compare the median time for both.
- Write simple comments to explain what the data shows

In [None]:
import matplotlib.pyplot as plt

# The two usage columns compared in this question.
social_col = 'Time_on_Social_Media'
education_col = 'Time_on_Education'

# Coerce both columns to numbers; entries that cannot be parsed become NaN.
for usage_col in (social_col, education_col):
    df[usage_col] = pd.to_numeric(df[usage_col], errors='coerce')

# (column, bar colour, panel title) for the left and right panels.
panels = [
    (social_col, 'skyblue', 'Time on Social Media'),
    (education_col, 'orange', 'Time on Education'),
]

# One figure, one row, two histograms side by side.
plt.figure(figsize=(12, 5))
for position, (usage_col, colour, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    df[usage_col].hist(color=colour, edgecolor='black')
    plt.title(heading)
    plt.xlabel('Hours')

plt.tight_layout()
plt.show()

# Medians of the two columns, printed for a direct comparison.
print("Median Time on Social Media:", df[social_col].median())
print("Median Time on Education:", df[education_col].median())


### Question 3

- How can we detect and identify outliers in Daily Usage Hours, Exercise Hours, and Screen Time Before Bed among teens using the IQR method?
- Visualize these outliers with boxplots and display the corresponding records.



In [None]:
import seaborn as sns

def detect_outliers_iqr(column, data=None):
    """Return the rows whose value in ``column`` is an outlier by the IQR rule.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the 25th and 75th percentiles of
    the column and ``IQR = Q3 - Q1``.

    Args:
        column: Name of the numeric column to inspect.
        data: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, which matches the original single-argument behaviour.

    Returns:
        A DataFrame with the outlier rows (all columns kept); empty when no
        value falls outside the fences.
    """
    # Only touch the global frame when the caller did not supply one.
    frame = df if data is None else data
    values = frame[column]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr

    return frame[(values < lower_fence) | (values > upper_fence)]

for col in ['Daily_Usage_Hours', 'Exercise_Hours', 'Screen_Time_Before_Bed']:
    # Boxplot of the column; with seaborn's default whiskers (1.5 * IQR) the
    # points drawn beyond them correspond to the IQR outliers listed below.
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot for {col}")
    plt.show()

    # The question also asks for the outlier records themselves, so list the
    # rows flagged by the IQR rule instead of leaving the helper unused.
    outliers = detect_outliers_iqr(col)
    print(f"{col}: {len(outliers)} outlier record(s)")
    print(outliers)


### Question 4

Analyze the correlation heatmap showing relationships between teen behaviors, screen usage, and mental health indicators.

Focus your attention on the following numeric variables:

- Daily_Usage_Hours, Sleep_Hours, Academic_Performance, Addiction_Level, Time_on_Social_Media,

- Time_on_Gaming, Time_on_Education, Apps_Used_Daily, Anxiety_Level, Depression_Level,

- Self_Esteem, Social_Interactions, Exercise_Hours, Screen_Time_Before_Bed,

- Phone_Checks_Per_Day, Family_Communication, Weekend_Usage_Hours


Identify two strong positive and two strong negative correlations involving any of the above variables.

Explain briefly what these correlations might suggest about teen behavior, wellbeing, or digital habits.

Use specific values from the heatmap to support your observation.



In [None]:
import seaborn as sns

# Numeric variables named in the question, in the order they are listed there.
selected_cols = [
    'Daily_Usage_Hours', 'Sleep_Hours', 'Academic_Performance', 'Addiction_Level',
    'Time_on_Social_Media', 'Time_on_Gaming', 'Time_on_Education', 'Apps_Used_Daily',
    'Anxiety_Level', 'Depression_Level', 'Self_Esteem', 'Social_Interactions',
    'Exercise_Hours', 'Screen_Time_Before_Bed', 'Phone_Checks_Per_Day',
    'Family_Communication', 'Weekend_Usage_Hours',
]

# Coerce each selected column to numbers; unparseable entries become NaN.
for col in selected_cols:
    numeric_values = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric_values

# Pairwise correlations between the selected columns.
correlations = df[selected_cols].corr()

# Annotated heatmap: each cell shows the coefficient to two decimals.
plt.figure(figsize=(14, 10))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


### Question 5

How does the average addiction level vary across different age groups? Show this with a line graph.

In [None]:
# Coerce both columns to numbers; unparseable entries become NaN.
df['Addiction_Level'] = pd.to_numeric(df['Addiction_Level'], errors='coerce')
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# pd.cut builds right-closed intervals by default, so with the edges
# [10, 13, 16, 19, 22] an age of exactly 10 fell outside the '10-13' bin and
# ages above 22 fell outside '20+'; both ended up as NaN and were dropped from
# the averages. include_lowest=True closes the first bin on the left, and an
# infinite upper edge makes the last bin open-ended, so the bins match their
# labels.
age_edges = [10, 13, 16, 19, float('inf')]
age_labels = ['10-13', '14-16', '17-19', '20+']
df['Age_Group'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels, include_lowest=True)

# Age_Group is categorical; passing observed=False explicitly keeps every age
# group in the result (empty groups get a NaN mean) rather than relying on the
# implicit default, which newer pandas versions warn about.
group_avg = df.groupby('Age_Group', observed=False)['Addiction_Level'].mean().reset_index()

# Line graph of the mean addiction level per age group.
sns.lineplot(data=group_avg, x='Age_Group', y='Addiction_Level', marker='o')
plt.title('Addiction Level by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Avg Addiction Level')
plt.grid(True)
plt.show()
