# Data Generation Notebook
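This notebook demonstrates how the synthetic student dataset is generated: it creates 500 student records with `StudentGenerator`, previews and summarises the result (including the deliberately introduced missing values), and saves the raw data to CSV.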

In [1]:
# Import required libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Make the project's src directory importable.
# The membership check keeps this cell idempotent: re-running it (common in
# notebooks) no longer stacks duplicate entries onto sys.path.
src_dir = os.path.abspath(os.path.join('..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_generator import StudentGenerator

## Initialize the Data Generator

In [2]:
# Cohort size used for this run, named so it is easy to spot and tune
N_STUDENTS = 500

# Build the generator for that cohort and confirm construction succeeded
generator = StudentGenerator(n_students=N_STUDENTS)
print("Student generator initialized successfully!")

Student generator initialized successfully!


## Generate Sample Data

In [8]:
# Produce the full dataset; the generator prints its own progress messages
students_df = generator.generate_dataset()

# Preview the top five records as this cell's rich output
students_df.head(n=5)

Generating 500 student records...
Introducing missing values...
Dataset saved to: C:\python\NCI_programming_ca1\data\students_raw.csv
Sample data:
  student_id first_name last_name  gender                  email  age  \
0       S001   Benjamin   Kiernan    Male  x001@student.ncirl.ie   20   
1       S002     Callan  McVicker    Male  x002@student.ncirl.ie   25   
2       S003     Fintan     Deere    Male  x003@student.ncirl.ie   20   
3       S004     Sorcha     Cally  Female  x004@student.ncirl.ie   21   
4       S005       Kane     Swift    Male  x005@student.ncirl.ie   29   

   study_hours  quiz_participation  past_performance  course_completion  
0        12.14                89.0              95.0               <NA>  
1         9.37                86.9              56.0               True  
2        12.79                82.7              60.0              False  
3         9.28                65.6              57.0              False  
4         7.78                95.8              51.0               True  

Unnamed: 0,student_id,first_name,last_name,gender,email,age,study_hours,quiz_participation,past_performance,course_completion
0,S001,Benjamin,Kiernan,Male,x001@student.ncirl.ie,20,12.14,89.0,95.0,
1,S002,Callan,McVicker,Male,x002@student.ncirl.ie,25,9.37,86.9,56.0,True
2,S003,Fintan,Deere,Male,x003@student.ncirl.ie,20,12.79,82.7,60.0,False
3,S004,Sorcha,Cally,Female,x004@student.ncirl.ie,21,9.28,65.6,57.0,False
4,S005,Kane,Swift,Male,x005@student.ncirl.ie,29,7.78,95.8,51.0,True


## Data Overview

In [9]:
# Basic information about the dataset:
# info() prints each column's dtype and non-null count, which makes the
# columns containing missing values easy to spot.
print("Dataset Info:")
students_df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   student_id          500 non-null    object 
 1   first_name          495 non-null    object 
 2   last_name           493 non-null    object 
 3   gender              497 non-null    object 
 4   email               497 non-null    object 
 5   age                 500 non-null    int64  
 6   study_hours         468 non-null    float64
 7   quiz_participation  467 non-null    float64
 8   past_performance    480 non-null    float64
 9   course_completion   488 non-null    boolean
dtypes: boolean(1), float64(3), int64(1), object(5)
memory usage: 36.3+ KB


In [10]:
# Basic statistics:
# include='all' summarises every column, not just the numeric ones:
# count/unique/top/freq for the text and boolean columns, and
# mean/std/min/quartiles for the numeric columns.
print("\nBasic Statistics:")
students_df.describe(include='all')


Basic Statistics:


Unnamed: 0,student_id,first_name,last_name,gender,email,age,study_hours,quiz_participation,past_performance,course_completion
count,500,495,493,497,497,500.0,468.0,467.0,480.0,488
unique,500,362,441,2,497,,,,,2
top,S001,Clifford,MacMullen,Male,x001@student.ncirl.ie,,,,,True
freq,1,4,3,256,1,,,,,344
mean,,,,,,22.152,12.278445,75.313471,69.20625,
std,,,,,,2.131192,16.978778,19.164466,19.79438,
min,,,,,,19.0,-4.805869,-7.676358,-9.0,
25%,,,,,,20.0,6.985,63.1,59.0,
50%,,,,,,22.0,9.825,75.5,70.0,
75%,,,,,,24.0,12.9425,88.3,82.0,


## Save the Generated Data

In [11]:
# Destination of the raw export; the parent folder is derived from this path
# so the directory that gets created always matches the file location.
output_path = '../data/raw/students_raw.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the frame without the positional index column, then report the target
students_df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

Data saved to ../data/raw/students_raw.csv
