In [4]:
# Attach Google Drive to this Colab runtime; the project files are read from
# and written to paths below /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on the mounted Drive.
base_folder = "/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main"
# Make the project root the kernel's working directory. IPython substitutes
# {base_folder} with the Python variable before running the magic.
%cd "{base_folder}"

/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main


In [8]:
# =============================================================================
# 01_create_database.ipynb
# Build normalized SQLite database from student_performance.csv
# Classification dataset (PASS / FAIL)
# =============================================================================

from pathlib import Path
import sqlite3
import pandas as pd

# -----------------------------------------------------------------------------
# Paths
# -----------------------------------------------------------------------------
# The CSV is this notebook's input; the SQLite file next to it is its output.
BASE_DIR = Path("/content/drive/MyDrive/work/Finalproject/housing_app_fall25-main")
DATA_DIR = BASE_DIR.joinpath("data")

CSV_PATH = DATA_DIR.joinpath("student_performance.csv")
DB_PATH = DATA_DIR.joinpath("student_performance.db")

print("Base folder:", BASE_DIR)
print("CSV:", CSV_PATH)

# Stop right away when the input is absent instead of failing inside read_csv.
if not CSV_PATH.exists():
    raise FileNotFoundError(f"CSV not found at {CSV_PATH}")

# -----------------------------------------------------------------------------
# Load CSV
# -----------------------------------------------------------------------------
df = pd.read_csv(CSV_PATH)

# read_csv labels header-less columns "Unnamed: N". This file has one
# ("Unnamed: 0", most likely a DataFrame index written out by to_csv); it is
# never used by the tables built below, so drop it rather than carry it around.
stray_index_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
df = df.drop(columns=stray_index_cols)

# Every column the tables below are built from. Checking here gives one clear
# error up front instead of a KeyError halfway through the build.
REQUIRED_COLUMNS = [
    "student_id",
    "age",
    "gender",
    "internet_access",
    "sleep_hours",
    "sleep_quality",
    "class_attendance",
    "course",
    "study_method",
    "facility_rating",
    "exam_difficulty",
    "study_hours",
    "exam_score",
]
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{CSV_PATH.name} is missing required columns: {missing_columns}"
    )

print(f"Rows loaded: {len(df)}")

display(df.head())

# -----------------------------------------------------------------------------
# Create derived target (PASS / FAIL)
# -----------------------------------------------------------------------------
# Lowest exam_score that still counts as a pass (inclusive).
PASS_THRESHOLD = 40

# A missing score must stay unlabelled: NaN >= threshold evaluates to False, so
# a bare comparison would silently record every missing score as "fail".
score_known = df["exam_score"].notna()
df["pass_fail"] = (
    (df["exam_score"] >= PASS_THRESHOLD)
    .map({True: "pass", False: "fail"})
    .where(score_known)
)

n_unlabelled = int((~score_known).sum())
if n_unlabelled:
    print(f"Warning: {n_unlabelled} rows have no exam_score and were left unlabelled.")

print("Target distribution:")
# value_counts ignores missing labels, so the proportions cover labelled rows only.
print(df["pass_fail"].value_counts(normalize=True))

# -----------------------------------------------------------------------------
# Normalize: split the flat frame into students / courses / exams
# -----------------------------------------------------------------------------

# STUDENTS table: per-student attributes, one row per distinct combination.
students = df[
    [
        "student_id",
        "age",
        "gender",
        "internet_access",
        "sleep_hours",
        "sleep_quality",
        "class_attendance",
    ]
].drop_duplicates()

# student_id is the PRIMARY KEY of the students table. drop_duplicates() only
# removes fully identical rows, so an id that appears with two different sets
# of attributes would survive and abort the INSERT later. Catch it here, while
# the previous database file is still intact.
repeated_ids = students.loc[students["student_id"].duplicated(), "student_id"]
if not repeated_ids.empty:
    raise ValueError(
        "student_id is not unique across student attributes; first offending "
        f"ids: {repeated_ids.unique()[:10].tolist()}"
    )

# COURSES table: one row per distinct (course, study_method, facility_rating)
# combination; course_id is a 1-based surrogate key for that combination.
courses = (
    df[["course", "study_method", "facility_rating"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
courses["course_id"] = courses.index + 1

# Attach the matching course_id to every source row. The right side is unique
# on the join keys, so the left join keeps exactly one output row per input row.
df = df.merge(courses, on=["course", "study_method", "facility_rating"], how="left")

# EXAMS table: one row per source row, pointing at its student and course.
exams = df[
    [
        "student_id",
        "course_id",
        "exam_difficulty",
        "study_hours",
        "exam_score",
        "pass_fail",
    ]
]

# -----------------------------------------------------------------------------
# Create SQLite DB
# -----------------------------------------------------------------------------
# The database is rebuilt from scratch on every run, so drop any previous file.
if DB_PATH.exists():
    DB_PATH.unlink()
    print("Old database removed.")

conn = sqlite3.connect(DB_PATH)
build_succeeded = False
try:
    # SQLite parses FOREIGN KEY clauses but does not enforce them unless this
    # pragma is switched on. It is a per-connection setting (not stored in the
    # file), so here it validates the rows inserted by this build.
    conn.execute("PRAGMA foreign_keys = ON")
    cur = conn.cursor()

    # -------------------------------------------------------------------------
    # SQL Schema
    # -------------------------------------------------------------------------
    cur.executescript(
        """
        CREATE TABLE students (
            student_id INTEGER PRIMARY KEY,
            age INTEGER,
            gender TEXT,
            internet_access TEXT,
            sleep_hours REAL,
            sleep_quality TEXT,
            class_attendance REAL
        );

        CREATE TABLE courses (
            course_id INTEGER PRIMARY KEY,
            course_name TEXT,
            study_method TEXT,
            facility_rating TEXT
        );

        CREATE TABLE exams (
            exam_id INTEGER PRIMARY KEY AUTOINCREMENT,
            student_id INTEGER,
            course_id INTEGER,
            exam_difficulty TEXT,
            study_hours REAL,
            exam_score REAL,
            pass_fail TEXT,
            FOREIGN KEY (student_id) REFERENCES students(student_id),
            FOREIGN KEY (course_id) REFERENCES courses(course_id)
        );
        """
    )

    # -------------------------------------------------------------------------
    # Insert Data
    # -------------------------------------------------------------------------
    # Columns are named on both the SQL and the DataFrame side so that a
    # reordered frame can never write values into the wrong column. Parent
    # tables go first so the foreign keys of exams resolve.
    cur.executemany(
        """
        INSERT INTO students (
            student_id, age, gender, internet_access,
            sleep_hours, sleep_quality, class_attendance
        )
        VALUES (?, ?, ?, ?, ?, ?, ?)
        """,
        students[
            [
                "student_id",
                "age",
                "gender",
                "internet_access",
                "sleep_hours",
                "sleep_quality",
                "class_attendance",
            ]
        ].itertuples(index=False, name=None),
    )

    # The DataFrame column "course" is stored as courses.course_name.
    cur.executemany(
        """
        INSERT INTO courses (course_id, course_name, study_method, facility_rating)
        VALUES (?, ?, ?, ?)
        """,
        courses[["course_id", "course", "study_method", "facility_rating"]]
        .itertuples(index=False, name=None),
    )

    # exam_id is omitted on purpose: AUTOINCREMENT assigns it.
    cur.executemany(
        """
        INSERT INTO exams (
            student_id, course_id, exam_difficulty,
            study_hours, exam_score, pass_fail
        )
        VALUES (?, ?, ?, ?, ?, ?)
        """,
        exams[
            [
                "student_id",
                "course_id",
                "exam_difficulty",
                "study_hours",
                "exam_score",
                "pass_fail",
            ]
        ].itertuples(index=False, name=None),
    )

    conn.commit()
    build_succeeded = True
finally:
    # Always release the file handle. Closing without a commit discards the
    # uncommitted inserts, which would leave a database with empty tables;
    # remove that file so a failed build cannot be mistaken for a good one.
    conn.close()
    if not build_succeeded and DB_PATH.exists():
        DB_PATH.unlink()

print("\n✅ DATABASE CREATED SUCCESSFULLY")
print("DB Path:", DB_PATH)
print("Tables: students, courses, exams")


Base folder: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main
CSV: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.csv
Rows loaded: 20000


Unnamed: 0,student_id,age,gender,course,study_hours,class_attendance,internet_access,sleep_hours,sleep_quality,study_method,facility_rating,exam_difficulty,exam_score
0,1,17,male,diploma,2.78,92.9,yes,7.4,poor,coaching,low,hard,58.9
1,2,23,other,bca,3.37,64.8,yes,4.6,average,online videos,medium,moderate,54.8
2,3,22,male,b.sc,7.88,76.8,yes,8.5,poor,coaching,high,moderate,90.3
3,4,20,other,diploma,0.67,48.4,yes,5.8,average,online videos,low,moderate,29.7
4,5,20,female,diploma,0.89,71.6,yes,9.8,poor,coaching,low,moderate,43.7


Target distribution:
pass_fail
pass    0.87125
fail    0.12875
Name: proportion, dtype: float64

✅ DATABASE CREATED SUCCESSFULLY
DB Path: /content/drive/MyDrive/work/Finalproject/housing_app_fall25-main/data/student_performance.db
Tables: students, courses, exams
