
# Setup: Import Libraries and Suppress Warnings



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import warnings

# Silence every warning category so that the cell outputs stay uncluttered.
warnings.filterwarnings('ignore')

# The notebook expects chinook.db, student-mat.csv, student-por.csv and the Telco
# churn CSV to be available in the working directory.
# On Google Colab they can be uploaded manually or fetched with commands such as
# the following (files hosted on GitHub; adjust the URLs as needed):
# !wget https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite -O chinook.db
# !wget https://raw.githubusercontent.com/gabrielaproenca/Student-Performance-Data-Set/master/student-mat.csv
# !wget https://raw.githubusercontent.com/gabrielaproenca/Student-Performance-Data-Set/master/student-por.csv
# !wget https://raw.githubusercontent.com/IBM/telco-customer-churn-extra-data/master/WA_Fn-UseC_-Telco-Customer-Churn.csv

# Placeholders for the objects that are shared across the parts of the notebook.
titanic_df = telco_df = merged_students_df = None
conn = None  # SQLite connection



## Part 1 Solutions: Data Loading, Inspection, and Cleaning

### Task 1.1 Solution: Load and Initial Inspection (Titanic Dataset)



In [None]:

# Read the Titanic passenger data directly from its GitHub-hosted CSV file.
titanic_csv_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_df = pd.read_csv(titanic_csv_url)



In [None]:
# Preview the first five rows, followed by a visual divider.
first_rows = titanic_df.head()
print("### First 5 rows of Titanic DataFrame:")
print(first_rows)
print("\n" + "#" * 50 + "\n")



### First 5 rows of Titanic DataFrame:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0        

In [None]:
# Print a concise structural summary of the DataFrame, exactly once.
# DataFrame.info() writes its report to stdout itself, so it is not wrapped in print().
print("### DataFrame Information:")
titanic_df.info()
print(" " + "#" * 50 + " ")

# Display descriptive statistics for the numerical columns.
print("### Descriptive Statistics for Numerical Columns:")
print(titanic_df.describe())
print("\n" + "#" * 50 + "\n")


### DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
 ################################################## 
### DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count


### Task 1.2 Solution: Handling Missing Values (Titanic Dataset)



In [None]:
# Count the missing values in every column (reported once).
print("### Missing Values Before Cleaning:")
print(titanic_df.isnull().sum())
print(" " + "#" * 50 + " ")  # Separator for clarity

# Share of missing values in 'Age' and 'Cabin', as a percentage of all rows.
missing_age_percent = (titanic_df['Age'].isnull().sum() / len(titanic_df)) * 100
missing_cabin_percent = (titanic_df['Cabin'].isnull().sum() / len(titanic_df)) * 100

print(f"Percentage of missing 'Age' values: {missing_age_percent:.2f}%")
print(f"Percentage of missing 'Cabin' values: {missing_cabin_percent:.2f}%")

# About the f-strings used above (available since Python 3.6):
# - The f prefix before the opening quotation mark allows Python expressions to be
#   embedded in the string inside {...} placeholders.
# - In {missing_age_percent:.2f} the colon introduces the format specifier.
# - .2 requests two digits after the decimal point.
# - f formats the value as a fixed-point (decimal) number.


### Missing Values Before Cleaning:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
 ################################################## 
### Missing Values Before Cleaning:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
 ################################################## 
Percentage of missing 'Age' values: 19.87%
Percentage of missing 'Cabin' values: 77.10%


In [None]:
# Fill missing 'Age' values with the median age.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on titanic_df['Age']: the in-place form acts on an
# intermediate Series (chained assignment), which pandas warns about and which
# leaves the DataFrame unchanged when Copy-on-Write is enabled.
median_age = titanic_df['Age'].median()
titanic_df['Age'] = titanic_df['Age'].fillna(median_age)
print(f"Filled missing 'Age' values with median: {median_age}")



Filled missing 'Age' values with median: 28.0


In [None]:
# Drop the 'Cabin' column due to a high percentage of missing values.
# Dropping in place on the DataFrame itself keeps the same titanic_df object.
titanic_df.drop(columns='Cabin', inplace=True)
print("Dropped 'Cabin' column.")

# Fill missing 'Embarked' values with the most frequent port.
# mode() returns a Series of the most common value(s); [0] takes the first one.
most_frequent_embarked = titanic_df['Embarked'].mode()[0]
# Assign the result back instead of using inplace=True on the selected column,
# which is chained assignment and does not update the DataFrame under
# pandas Copy-on-Write.
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(most_frequent_embarked)
print(f"Filled missing 'Embarked' values with most frequent: {most_frequent_embarked}")
print(" " + "#" * 50 + " ")

# Verify that no missing values remain
print("### Missing Values After Cleaning:")
print(titanic_df.isnull().sum())
print(" " + "#" * 50 + " ")



Dropped 'Cabin' column.
Filled missing 'Embarked' values with most frequent: S
 ################################################## 
### Missing Values After Cleaning:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
 ################################################## 



### Task 1.3 Solution: Data Type Conversion & Basic Feature Engineering (Telco Customer Churn)



In [None]:
# Load the Telco Customer Churn dataset from the working directory.
telco_df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

print("### Initial Info for Telco DataFrame:")
telco_df.info()
print(" " + "❤COMSATS" * 10 + " ")

# Inspect the raw 'TotalCharges' values before converting them.
print("Unique values in 'TotalCharges' before conversion (first few):", telco_df['TotalCharges'].unique()[:10])

# Convert 'TotalCharges' to numeric; values that cannot be parsed become NaN.
telco_df['TotalCharges'] = pd.to_numeric(telco_df['TotalCharges'], errors='coerce')

# Replace the NaNs created by the conversion (e.g. from empty strings) with 0.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on telco_df['TotalCharges'] is chained assignment and does not update the
# DataFrame under pandas Copy-on-Write.
telco_df['TotalCharges'] = telco_df['TotalCharges'].fillna(0)

print(" ### Info for Telco DataFrame after TotalCharges conversion:")
telco_df.info()
print(" " + "#" * 50 + " ")

# Create 'HasInternetService': 1 when InternetService is 'DSL' or 'Fiber optic',
# otherwise 0. isin() evaluates the whole column at once instead of calling a
# Python lambda per row; int64 matches the dtype the row-wise version produced.
telco_df['HasInternetService'] = telco_df['InternetService'].isin(['DSL', 'Fiber optic']).astype('int64')

print(" ### Head of Telco DataFrame with new 'HasInternetService' column:")
print(telco_df[['InternetService', 'HasInternetService']].head())
print(" " + "❤COMSATS" * 10 + " ")


### Initial Info for Telco DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  Pape


## Part 2 Solutions: SQL with Python

### Task 2.1 Solution: Connect and Query



In [None]:
# Connect to the Chinook database.
# Every later SQL cell uses `conn` and `cursor`, so they must be created here;
# with these lines commented out, `cursor` is undefined and `conn` is still None.
# sqlite3.connect() creates an empty file if chinook.db does not exist, so make
# sure the database has been downloaded (see the wget cell below) before
# running any query.
conn = sqlite3.connect('chinook.db')
cursor = conn.cursor()



In [None]:
# Shell command (IPython "!" syntax): download the Chinook SQLite file and save it as chinook.db.
!wget https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite -O chinook.db


--2025-06-12 00:30:02--  https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1067008 (1.0M) [application/octet-stream]
Saving to: ‘chinook.db’


2025-06-12 00:30:03 (23.4 MB/s) - ‘chinook.db’ saved [1067008/1067008]



In [None]:
# Query text: first and last name of every row in the Employee table.
sql_query_employees = (
    "SELECT FirstName, LastName "
    "FROM Employee;"
)



In [None]:
# List the names of all tables recorded in the sqlite_master catalogue.
table_listing_sql = "SELECT name FROM sqlite_master WHERE type='table';"
tables = cursor.execute(table_listing_sql).fetchall()
print(tables)


[('Album',), ('Artist',), ('Customer',), ('Employee',), ('Genre',), ('Invoice',), ('InvoiceLine',), ('MediaType',), ('Playlist',), ('PlaylistTrack',), ('Track',)]


In [None]:

# Execute the employee-name query on the shared cursor.
# As the last expression of the cell, the returned cursor object is echoed in the output.
cursor.execute(sql_query_employees)

<sqlite3.Cursor at 0x7ca1b64f53c0>

In [None]:
# Pull every remaining row of the executed query off the cursor.
employee_results = cursor.fetchall()

# Show one (first name, last name) tuple per line.
print("### Employee First and Last Names:")
for name_pair in employee_results:
    print(name_pair)
print(" " + "#" * 50 + " ")



### Employee First and Last Names:
('Andrew', 'Adams')
('Nancy', 'Edwards')
('Jane', 'Peacock')
('Margaret', 'Park')
('Steve', 'Johnson')
('Michael', 'Mitchell')
('Robert', 'King')
('Laura', 'Callahan')
 ################################################## 


### Task 2.2 Solution: Aggregate Data with SQL and Analyze with Pandas


In [None]:
# Task 2.2 Solution: Aggregate Data with SQL and Analyze with Pandas
# Count the customers of each country, largest count first.
sql_query_Customer_by_Country = (
    "SELECT Country, COUNT(CustomerId) AS NumberOfCustomers "
    "FROM Customer "
    "GROUP BY Country "
    "ORDER BY NumberOfCustomers DESC;"
)

# Run the query on the open connection and collect the rows in a DataFrame.
customers_by_country_df = pd.read_sql_query(sql=sql_query_Customer_by_Country, con=conn)

print("### Customers by Country:")
print(customers_by_country_df)
print("\n" + "#" * 50 + "\n")  # Separator for clarity




### Customers by Country:
           Country  NumberOfCustomers
0              USA                 13
1           Canada                  8
2           France                  5
3           Brazil                  5
4          Germany                  4
5   United Kingdom                  3
6         Portugal                  2
7            India                  2
8   Czech Republic                  2
9           Sweden                  1
10           Spain                  1
11          Poland                  1
12          Norway                  1
13     Netherlands                  1
14           Italy                  1
15         Ireland                  1
16         Hungary                  1
17         Finland                  1
18         Denmark                  1
19           Chile                  1
20         Belgium                  1
21         Austria                  1
22       Australia                  1
23       Argentina                  1

#######################

In [None]:
# Find the index label of the largest customer count, then pull that row out as a Series.
top_row_label = customers_by_country_df['NumberOfCustomers'].idxmax()
most_customers_country = customers_by_country_df.loc[top_row_label]

print("### Country with the Most Customers:")
print(most_customers_country)
print("\n" + "#" * 50 + "\n")  # Separator for clarity

### Country with the Most Customers:
Country              USA
NumberOfCustomers     13
Name: 0, dtype: object

##################################################



### Task 2.3 Solution: Joining Tables and Data Manipulation

In [None]:
# Task 2.3 Solution: Joining Tables and Data Manipulation
# SQL query joining each invoice with its line items.
# In this Chinook file the tables are named Invoice and InvoiceLine (see the
# table listing printed earlier), and the invoice key column is InvoiceId.
sql_query_invoice_details = """
SELECT
    i.InvoiceId,
    i.BillingCountry,
    i.Total,
    il.UnitPrice,
    il.Quantity
FROM
    Invoice AS i
INNER JOIN
    InvoiceLine AS il ON i.InvoiceId = il.InvoiceId;
"""

In [None]:
# Load the joined data into a Pandas DataFrame.
invoice_details_df = pd.read_sql_query(sql_query_invoice_details, conn)

# The following cells refer to this frame as invoice_line_df and
# invoice_details_df. The original invoice_item_df name and invoice_line_df
# are kept as aliases of the same object so that every reference resolves.
invoice_item_df = invoice_details_df
invoice_line_df = invoice_details_df


DatabaseError: Execution failed on sql '
SELECT
    i.Invoice,
    i.BillingCountry,
    i.Total,
    ii.UnitPrice,
    ii.Quantity
FROM
    invoices AS i
INNER JOIN
    invoice_items AS ii ON i.InvoiceId = ii.InvoiceId;
': no such table: invoices

In [None]:
# Preview the joined invoice data. The frame is referenced as invoice_details_df,
# the name the LineTotal and revenue-by-country cells operate on.
print("### Head of Joined Invoice Details DataFrame:")
print(invoice_details_df.head())
print("\n" + "#" * 50 + "\n") # Separator for clarity



### Head of Joined Invoice Details DataFrame:


NameError: name 'invoice_line_df' is not defined

In [None]:
# Revenue of each invoice line: unit price multiplied by the quantity sold.
unit_prices = invoice_details_df['UnitPrice']
quantities = invoice_details_df['Quantity']
invoice_details_df['LineTotal'] = unit_prices * quantities

print("### Head of Invoice Details with LineTotal:")
print(invoice_details_df.head())
print("\n" + "#" * 50 + "\n")  # Separator for clarity




NameError: name 'invoice_details_df' is not defined

In [None]:
# Sum LineTotal per billing country, then select the row with the largest sum.
revenue_per_country = invoice_details_df.groupby('BillingCountry')['LineTotal'].sum()
total_revenue_by_country = revenue_per_country.reset_index()
best_row_label = total_revenue_by_country['LineTotal'].idxmax()
highest_revenue_country = total_revenue_by_country.loc[best_row_label]

print("### Country with the Highest Total Revenue (LineTotal):")
print(highest_revenue_country)
print("\n" + "#" * 50 + "\n")  # Separator for clarity


NameError: name 'invoice_details_df' is not defined

### Task 2.4 Solution: Analyze and Visualize Top Music Genres (Integrated Task)

In [None]:
# Task 2.4 Solution: Analyze and Visualize Top Music Genres (Integrated Task)
# SQL query computing the total revenue per genre, highest revenue first.
# Revenue is the sum of UnitPrice * Quantity over the invoice lines of the
# genre's tracks. The table names match this Chinook file (Genre, Track,
# InvoiceLine), as shown by the table listing printed earlier.
sql_query_genre_revenue = """
SELECT
    g.Name AS GenreName,
    SUM(il.UnitPrice * il.Quantity) AS TotalRevenue
FROM
    Genre AS g
INNER JOIN
    Track AS t ON g.GenreId = t.GenreId
INNER JOIN
    InvoiceLine AS il ON t.TrackId = il.TrackId
GROUP BY
    g.Name
ORDER BY
    TotalRevenue DESC;
"""

# Run the genre revenue query and keep the result as a DataFrame.
genre_revenue_df = pd.read_sql_query(sql=sql_query_genre_revenue, con=conn)
divider = "\n" + "#" * 50 + "\n"  # visual separator between outputs

print("### Total Revenue per Music Genre:")
print(genre_revenue_df.head(10))  # first ten rows of the result
print(divider)

# Bar chart of the first five rows (revenue on the x-axis, genre on the y-axis).
top_five_genres = genre_revenue_df.head(5)
plt.figure(figsize=(12, 7))
sns.barplot(data=top_five_genres, x='TotalRevenue', y='GenreName', palette='viridis')
plt.title('Top 5 Music Genres by Total Revenue')
plt.xlabel('Total Revenue ($)')
plt.ylabel('Genre Name')
plt.show()
print(divider)

# All SQL work of Part 2 is finished at this point, so release the connection.
conn.close()
print("Chinook database connection closed.")


## Part 3 Solutions: Data Merging and Advanced Visualization

### Task 3.1 Solution: Merging Datasets (Student Performance Data)

In [None]:
# Task 3.1 Solution: Merging Datasets (Student Performance Data)
# Both CSV files use ';' as the field delimiter, so it is passed explicitly.
divider = "\n" + "#" * 50 + "\n"  # visual separator between outputs

try:
    student_mat_df = pd.read_csv("student-mat.csv", sep=';')
    student_por_df = pd.read_csv("student-por.csv", sep=';')
except FileNotFoundError:
    # Tell the reader how to obtain the files, then abort with an explicit error.
    print("Ensure 'student-mat.csv' and 'student-por.csv' are in the correct directory or downloaded via wget commands in setup.")
    raise FileNotFoundError("Student performance CSVs not found. Please download them.")

print("### Head of Math Students DataFrame:")
print(student_mat_df.head())
print(divider)

print("### Head of Portuguese Students DataFrame:")
print(student_por_df.head())
print(divider)

# Join keys: the attribute columns listed here. The grade columns G1, G2 and G3
# are deliberately left out, so each course keeps its own grades after the merge.
common_columns = [
    'school', 'sex', 'age', 'address', 'famsize', 'Pstatus',
    'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian',
    'traveltime', 'studytime', 'failures',
    'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet',
    'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences',
]

# Inner join: only rows that match on every key column in both files are kept.
# Overlapping non-key columns get the suffixes _math / _por (e.g. G1_math, G3_por).
merged_students_df = student_mat_df.merge(
    student_por_df,
    how='inner',
    on=common_columns,
    suffixes=('_math', '_por'),
)

print("### Shape of Merged Students DataFrame (Common Students who took both courses):")
print(merged_students_df.shape)
print(divider)

print("### Head of Merged Students DataFrame (showing some common and specific columns):\n")
# A few key columns plus the suffixed grade columns confirm the merge result.
preview_columns = ['school', 'sex', 'age', 'G1_math', 'G2_math', 'G3_math', 'G1_por', 'G2_por', 'G3_por']
print(merged_students_df[preview_columns].head())
print(divider)

# 'merged_students_df' is used again in Task 3.4.


### Task 3.2 Solution: Univariate and Bivariate Visualization (Titanic Dataset - Using Cleaned Data from Part 1)

In [None]:
# Task 3.2 Solution: Univariate and Bivariate Visualization (Titanic Dataset - Using Cleaned Data from Part 1)
# Apply seaborn's 'whitegrid' style to the plots.
sns.set_style('whitegrid')
divider = "\n" + "#" * 50 + "\n"  # visual separator between outputs

# Histogram of Age with 20 bins and a KDE curve.
plt.figure(figsize=(10, 6))
sns.histplot(data=titanic_df, x='Age', bins=20, kde=True)
plt.title('Distribution of Passenger Age on Titanic')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
print(divider)

# Number of passengers per value of Survived, with readable tick labels.
plt.figure(figsize=(7, 5))
sns.countplot(data=titanic_df, x='Survived', palette='pastel')
plt.title('Survival Count on Titanic (0=No, 1=Yes)')
plt.xlabel('Survived')
plt.ylabel('Number of Passengers')
plt.xticks([0, 1], ['Did Not Survive', 'Survived'])
plt.show()
print(divider)

# Box plot of Fare for each passenger class.
plt.figure(figsize=(10, 6))
sns.boxplot(data=titanic_df, x='Pclass', y='Fare', palette='viridis')
plt.title('Fare Distribution Across Passenger Classes')
plt.xlabel('Passenger Class')
plt.ylabel('Fare')
plt.show()
print(divider)

# Counts per Pclass, coloured by Survived, with one panel (column) per Sex.
g = sns.catplot(
    data=titanic_df,
    kind='count',
    x='Pclass',
    hue='Survived',
    col='Sex',
    height=5,
    aspect=1.2,
    palette='Set2',
)
g.set_axis_labels("Passenger Class", "Number of Passengers")
g.set_titles("Sex: {col_name}")
plt.suptitle('Survival Count by Passenger Class and Sex', y=1.02)  # y=1.02 places the title above the panels
plt.show()
print(divider)


### Task 3.3 Solution: Churn Analysis with Engineered Feature (Telco Customer Churn - Using Engineered Feature from Part 1)

In [None]:
# Task 3.3 Solution: Churn Analysis with Engineered Feature (Telco Customer Churn - Using Engineered Feature from Part 1)
# Relies on the 'HasInternetService' column that Task 1.3 added to 'telco_df'.
divider = "\n" + "#" * 50 + "\n"  # visual separator between outputs

# Customer counts per HasInternetService value, split by Churn.
plt.figure(figsize=(8, 6))
sns.countplot(data=telco_df, x='HasInternetService', hue='Churn', palette='coolwarm')
plt.title('Churn Rate by Internet Service Availability')
plt.xlabel('Has Internet Service (0=No, 1=Yes)')
plt.ylabel('Number of Customers')
plt.xticks([0, 1], ['No Internet Service', 'Has Internet Service'])
plt.legend(title='Churn', labels=['No Churn', 'Churn'])
plt.show()
print(divider)

# Optional deeper look: share of each Churn value within every HasInternetService
# group. normalize=True yields proportions between 0 and 1; unstack() turns the
# Churn values into columns.
churn_shares = telco_df.groupby('HasInternetService')['Churn'].value_counts(normalize=True)
churn_by_internet = churn_shares.unstack()
print("### Churn Percentage by Internet Service:")
print(churn_by_internet)
print(divider)


### Task 3.4 Solution: Comparative Analysis of Student Performance (Integrated Task)

In [None]:
# Task 3.4 Solution: Comparative Analysis of Student Performance (Integrated Task)
# Relies on 'merged_students_df' created in Task 3.1.
divider = "\n" + "#" * 50 + "\n"  # visual separator between outputs

# Mean final grade (G3) in each course for the merged students.
avg_g3_math = merged_students_df['G3_math'].mean()
avg_g3_por = merged_students_df['G3_por'].mean()

print(f"Average final grade in Mathematics (G3_math): {avg_g3_math:.2f}")
print(f"Average final grade in Portuguese (G3_por): {avg_g3_por:.2f}")
print(divider)

# Two-row frame (one row per course) that feeds the bar chart.
grade_comparison_df = pd.DataFrame(
    [('Mathematics', avg_g3_math), ('Portuguese', avg_g3_por)],
    columns=['Course', 'Average G3'],
)

# Bar chart comparing the two averages.
plt.figure(figsize=(8, 6))
sns.barplot(data=grade_comparison_df, x='Course', y='Average G3', palette='Paired')
plt.title('Comparison of Average Final Grades (G3) for Students in Both Courses')
plt.xlabel('Course')
plt.ylabel('Average Final Grade (G3)')
plt.ylim(0, 20)  # Grades are typically 0-20
plt.show()
print(divider)

# Optional: box plots of both G3 columns show the distributions, not just the averages.
final_grades = merged_students_df[['G3_math', 'G3_por']]
plt.figure(figsize=(10, 6))
sns.boxplot(data=final_grades, palette='light:b')
plt.title('Distribution of Final Grades (G3) for Students in Both Courses')
plt.xlabel('Course')
plt.ylabel('Final Grade (G3)')
plt.xticks([0, 1], ['Mathematics (G3)', 'Portuguese (G3)'])
plt.show()
print(divider)
