# Import Libraries and Data

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory holding the competition CSVs. Kept in one place so the notebook
# can be pointed at a different copy of the data by editing a single line.
DATA_DIR = '/kaggle/input/playground-series-s6e1'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Data Overview

In [None]:
def check(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generates a concise summary of DataFrame columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` (indexed by column name) with:

        - ``dtype``: the column's dtype.
        - ``instances``: number of non-null values (``df.count()``).
        - ``unique``: number of distinct values (``df.nunique()``).
        - ``sum_null``: number of null values.
        - ``duplicates``: number of rows flagged by ``df.duplicated()``.
          This is a frame-level count, so the same value is repeated on
          every row of the summary.
    """
    # Frame-level scalar; pandas broadcasts it to every row of the summary.
    duplicate_rows = df.duplicated().sum()

    # All remaining statistics are per-column Series sharing df.columns as
    # their index, so they align automatically in the constructor below.
    return pd.DataFrame({
        'dtype': df.dtypes,
        'instances': df.count(),
        'unique': df.nunique(),
        'sum_null': df.isnull().sum(),
        'duplicates': duplicate_rows,
    })

# Show the column summary and the first rows for each dataset, train first.
for label, frame in (("Train Data", train), ("Test Data", test)):
    print(label)
    display(check(frame))
    display(frame.head())

# Explore Data and Insights

In [None]:
# Global plotting style: seaborn grid theme and palette, plus matplotlib
# defaults for figure size and font size used by every figure below.
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

# Distribution of Exam Scores: histogram with a KDE overlay of the target.
fig, ax = plt.subplots()
sns.histplot(
    train['exam_score'],
    kde=True, bins=50, color='#4c72b0', alpha=0.8,
    ax=ax,
)
ax.set_title('Distribution of Exam Scores', fontsize=18, fontweight='bold', pad=20)
ax.set_xlabel('Exam Score')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()

# Exam Score by Gender: box plot per gender with the raw points overlaid.
gender_axes = dict(x='gender', y='exam_score', data=train)
fig, ax = plt.subplots()
sns.boxplot(palette="Set2", linewidth=2.0, ax=ax, **gender_axes)
# NOTE(review): stripplot draws one marker per row of `train`; with
# jitter=False they stack on a single vertical line per category. This may
# be slow if `train` is large - consider plotting a sample.
sns.stripplot(color='black', alpha=0.2, jitter=False, ax=ax, **gender_axes)
ax.set_title('Exam Score Distribution by Gender', fontsize=18, fontweight='bold', pad=20)
ax.set_xlabel('Gender')
ax.set_ylabel('Exam Score')
fig.tight_layout()
plt.show()


def _plot_mean_score_by_category(category, title, xlabel, palette,
                                 ylabel='Average Exam Score'):
    """
    Bar chart of the mean `exam_score` in `train` for each level of `category`.

    Bars are ordered from the highest to the lowest mean score.

    Parameters
    ----------
    category : str
        Name of the categorical column in `train` to group by.
    title : str
        Figure title.
    xlabel : str
        Label for the x axis.
    palette : str
        Seaborn palette name used to colour the bars.
    ylabel : str, optional
        Label for the y axis.

    Returns
    -------
    pandas.Index
        Category levels sorted by descending mean score (the bar order).
    """
    level_order = (
        train.groupby(category)['exam_score']
        .mean()
        .sort_values(ascending=False)
        .index
    )
    fig, ax = plt.subplots()
    sns.barplot(
        x=category, y='exam_score', data=train,
        order=level_order, palette=palette,
        # `ci` is deprecated since seaborn 0.12; `errorbar=None` is the
        # supported way to turn off the error bars.
        errorbar=None,
        ax=ax,
    )
    ax.set_title(title, fontsize=18, fontweight='bold', pad=20)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()
    return level_order


# Average Exam Score by Exam Difficulty
difficulty_order = _plot_mean_score_by_category(
    'exam_difficulty',
    title='Average Exam Score by Perceived Exam Difficulty',
    xlabel='Exam Difficulty',
    palette="viridis",
)

# Average Exam Score by Sleep Quality
sleep_order = _plot_mean_score_by_category(
    'sleep_quality',
    title='Average Exam Score by Sleep Quality',
    xlabel='Sleep Quality',
    palette="plasma",
)

In [None]:
def _plot_score_trend(feature, title, xlabel, point_color):
    """
    Scatter plot of `exam_score` against `feature` from `train`, with the
    fitted regression line that `sns.regplot` draws in red on top.

    Parameters
    ----------
    feature : str
        Name of the column in `train` plotted on the x axis.
    title : str
        Figure title.
    xlabel : str
        Label for the x axis.
    point_color : str
        Colour of the scatter markers.
    """
    fig, ax = plt.subplots()
    sns.regplot(
        x=feature, y='exam_score', data=train,
        scatter_kws={'alpha': 0.6, 'color': point_color},
        line_kws={'color': 'red', 'linewidth': 2},
        ax=ax,
    )
    ax.set_title(title, fontsize=18, fontweight='bold', pad=20)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Exam Score')
    fig.tight_layout()
    plt.show()


# Exam Score vs. Study Hours (with trend line)
_plot_score_trend(
    'study_hours',
    title='Exam Score vs. Study Hours',
    xlabel='Study Hours per Week',
    point_color='#55a868',
)

# Exam Score vs. Class Attendance (with trend line)
_plot_score_trend(
    'class_attendance',
    title='Exam Score vs. Class Attendance',
    xlabel='Class Attendance (%)',
    point_color='#c44e52',
)