In [1]:
# Install PySpark in Google Colab
!pip install pyspark



In [3]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, when, count, regexp_replace
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Create the Spark session used by every later cell
# (getOrCreate reuses a session if one is already running).
spark = (
    SparkSession.builder
    .appName("Lab3")
    .getOrCreate()
)

In [2]:
# Ask for the dataset through Colab's upload widget.
# `uploaded` is indexed by file name in the next cell.
from google.colab import files

print("Please upload your dataset (CSV file)")
uploaded = files.upload()

Please upload your dataset (CSV file)


Saving Student_performance_data _.csv to Student_performance_data _.csv


In [6]:
# Use the first uploaded file as the dataset; the load cell reads it by name.
if not uploaded:
    # NOTE(review): presumably `uploaded` is empty when the upload dialog is
    # cancelled - fail with a clear message instead of a bare IndexError.
    raise ValueError("No file was uploaded. Re-run the upload cell and choose a CSV file.")
file_name = next(iter(uploaded))
print(f"\n✅ Uploaded File: {file_name}")


✅ Uploaded File: Student_performance_data _.csv


In [7]:
# Read the uploaded CSV into a Spark DataFrame: the first row supplies the
# column names and Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_name)
)

In [8]:
# Preview the first five records of the loaded dataset
print("\n📊 First 5 Rows of Dataset:")
df.show(n=5)


📊 First 5 Rows of Dataset:
+---------+---+------+---------+-----------------+------------------+--------+--------+---------------+---------------+------+-----+------------+------------------+----------+
|StudentID|Age|Gender|Ethnicity|ParentalEducation|   StudyTimeWeekly|Absences|Tutoring|ParentalSupport|Extracurricular|Sports|Music|Volunteering|               GPA|GradeClass|
+---------+---+------+---------+-----------------+------------------+--------+--------+---------------+---------------+------+-----+------------+------------------+----------+
|     1001| 17|     1|        0|                2|19.833722807854713|       7|       1|              2|              0|     0|    1|           0| 2.929195591667681|       2.0|
|     1002| 18|     0|        0|                1| 15.40875605584674|       0|       0|              1|              0|     0|    0|           0| 3.042914833436377|       1.0|
|     1003| 15|     0|        2|                3|  4.21056976881226|      26|       0|     

In [9]:
# Count missing entries per column.
# For float/double columns NaN is counted as missing as well, because
# isNull() on its own does not match NaN values.
from pyspark.sql.functions import isnan

print("\n🔍 Checking for Missing Values:")
missing_exprs = []
for name, dtype in df.dtypes:
    is_missing = col(name).isNull()
    if dtype in ("float", "double"):
        # isnan() is only defined for floating-point columns
        is_missing = is_missing | isnan(col(name))
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so this counts only the missing rows.
    missing_exprs.append(count(when(is_missing, name)).alias(name))

missing_values = df.select(missing_exprs)
missing_values.show()


🔍 Checking for Missing Values:
+---------+---+------+---------+-----------------+---------------+--------+--------+---------------+---------------+------+-----+------------+---+----------+
|StudentID|Age|Gender|Ethnicity|ParentalEducation|StudyTimeWeekly|Absences|Tutoring|ParentalSupport|Extracurricular|Sports|Music|Volunteering|GPA|GradeClass|
+---------+---+------+---------+-----------------+---------------+--------+--------+---------------+---------------+------+-----+------------+---+----------+
|        0|  0|     0|        0|                0|              0|       0|       0|              0|              0|     0|    0|           0|  0|         0|
+---------+---+------+---------+-----------------+---------------+--------+--------+---------------+---------------+------+-----+------------+---+----------+



In [13]:
# Fill missing values:
#   - numeric columns get their column mean
#   - string columns get their most frequent non-null value (the mode)
numeric_types = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
num_cols = [c[0] for c in df.dtypes if c[1] in numeric_types]
if num_cols:
    # Compute every mean in a single aggregation instead of launching one
    # Spark job per column.
    mean_row = df.select([mean(col(c)).alias(c) for c in num_cols]).first()
    # A column with no non-null values has no mean (None); fillna cannot take
    # None as a replacement, so such columns are left untouched.
    numeric_fill = {c: mean_row[c] for c in num_cols if mean_row[c] is not None}
    if numeric_fill:
        df = df.fillna(numeric_fill)

cat_cols = [c[0] for c in df.dtypes if c[1] == 'string']
categorical_fill = {}
for col_name in cat_cols:
    # Exclude nulls before ranking, otherwise null itself can come out as the
    # "most frequent value" and there would be nothing valid to fill with.
    mode_row = (
        df.filter(col(col_name).isNotNull())
        .groupBy(col_name)
        .count()
        .orderBy(col("count").desc())
        .first()
    )
    if mode_row is not None:  # None when the column holds no non-null value
        categorical_fill[col_name] = mode_row[0]
if categorical_fill:
    df = df.fillna(categorical_fill)

print("\n✅ Missing values filled!")


✅ Missing values filled!


In [14]:
# Normalise every column name in one pass: lowercase, spaces -> underscores
normalised_names = [name.lower().replace(" ", "_") for name in df.columns]
df = df.toDF(*normalised_names)




In [15]:
# Show the column names after normalisation
renamed_columns = df.columns
print("\n🔤 Updated Column Names:", renamed_columns)


🔤 Updated Column Names: ['studentid', 'age', 'gender', 'ethnicity', 'parentaleducation', 'studytimeweekly', 'absences', 'tutoring', 'parentalsupport', 'extracurricular', 'sports', 'music', 'volunteering', 'gpa', 'gradeclass']


In [16]:
# Encode categorical variables using one-hot encoding.
# Each listed column is indexed with StringIndexer and then expanded with
# OneHotEncoder into a "<name>_encoded" vector column; the original column and
# the intermediate "<name>_index" column are dropped afterwards.
# NOTE(review): StringIndexer's default ordering is by descending label
# frequency, so the position set in each *_encoded vector is not the original
# numeric code - confirm before interpreting the vectors.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

cat_columns = ['gender', 'ethnicity', 'parentaleducation', 'parentalsupport', 'tutoring', 'extracurricular']
# Only encode columns that are actually present in the DataFrame.
# (The loop variable is deliberately not called `col`, which is the name of
# the imported pyspark function.)
valid_columns = [name for name in cat_columns if name in df.columns]

if valid_columns:
    indexers = [StringIndexer(inputCol=name, outputCol=name + "_index") for name in valid_columns]
    encoders = [OneHotEncoder(inputCol=name + "_index", outputCol=name + "_encoded") for name in valid_columns]

    pipeline = Pipeline(stages=indexers + encoders)
    df = pipeline.fit(df).transform(df)

    # Drop original categorical columns and index columns
    df = df.drop(*valid_columns, *[name + "_index" for name in valid_columns])

    print("\n✅ Categorical Variables Encoded!")
else:
    # Previously the success message was printed even when nothing was encoded
    print("\n⚠️ No categorical columns found - nothing was encoded.")
df.show(5)



✅ Categorical Variables Encoded!
+---------+---+------------------+--------+------+-----+------------+------------------+----------+--------------+-----------------+-------------------------+-----------------------+----------------+-----------------------+
|studentid|age|   studytimeweekly|absences|sports|music|volunteering|               gpa|gradeclass|gender_encoded|ethnicity_encoded|parentaleducation_encoded|parentalsupport_encoded|tutoring_encoded|extracurricular_encoded|
+---------+---+------------------+--------+------+-----+------------+------------------+----------+--------------+-----------------+-------------------------+-----------------------+----------------+-----------------------+
|     1001| 17|19.833722807854713|       7|     0|    1|           0| 2.929195591667681|       2.0| (1,[0],[1.0])|    (3,[0],[1.0])|            (4,[0],[1.0])|          (4,[0],[1.0])|       (1,[],[])|          (1,[0],[1.0])|
|     1002| 18| 15.40875605584674|       0|     0|    0|           0| 

In [17]:
# Average GPA per gender group; skipped unless both columns are present.
# The grouping key is the one-hot vector column produced by the encoding cell.
if all(name in df.columns for name in ("gpa", "gender_encoded")):
    df_gpa_by_gender = (
        df.groupBy("gender_encoded")
        .agg(mean("gpa").alias("avg_gpa"))
    )
    print("\n📊 Average GPA by Gender:")
    df_gpa_by_gender.show()


📊 Average GPA by Gender:
+--------------+------------------+
|gender_encoded|           avg_gpa|
+--------------+------------------+
| (1,[0],[1.0])|1.8942253102389857|
|     (1,[],[])| 1.918678894880205|
+--------------+------------------+



In [18]:
# Export the processed data: convert the Spark DataFrame to pandas and write
# it out as a single CSV file without the pandas index column.
cleaned_file_path = "/content/cleaned_dataset_spark.csv"
(
    df.toPandas()
    .to_csv(cleaned_file_path, index=False)
)

print("\n✅ Cleaned dataset saved as 'cleaned_dataset_spark.csv'!")


✅ Cleaned dataset saved as 'cleaned_dataset_spark.csv'!
