## Install & Import Libraries

In [None]:
!pip install pandas sqlalchemy matplotlib seaborn

import pandas as pd
import sqlite3
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns

# Load, Store, and Clean Data

In [None]:
data_path = 'vgsales.csv'

# 1. Load data from CSV
# The raw CSV is read into `df_raw` and then written to a local SQLite file.
# Any failure is reported and then re-raised: every later cell depends on the
# 'sales' table, so continuing after an error would only surface later as a
# confusing NameError.
try:
    df_raw = pd.read_csv(data_path)
    print("Raw data loaded successfully. Head:")
    print(df_raw.head())

    # 2. Store data in SQLite
    engine = create_engine('sqlite:///vgsales.db', echo=False)

    # if_exists='replace' recreates the 'sales' table, so re-running this cell
    # does not append duplicate rows.
    df_raw.to_sql('sales', con=engine, if_exists='replace', index=False)

    print("\nData successfully saved to 'vgsales.db'")

except Exception as e:
    print(f"An error occurred during loading/storing: {e}")
    # Do not swallow the error: stop here with the full traceback.
    raise


# 3. Load from database and clean
# Reads the 'sales' table back from SQLite and produces `df_cleaned`, the frame
# used by every chart below. Errors are reported and re-raised so the notebook
# stops here instead of failing later because `df_cleaned` was never defined.
try:
    engine = create_engine('sqlite:///vgsales.db')

    df = pd.read_sql_table('sales', con=engine)

    # Drop rows where 'Year' is missing, then store 'Year' as an integer.
    # Building the result with .assign() returns a new DataFrame, which avoids
    # the SettingWithCopyWarning that assigning a column on the dropna()
    # result can raise, and leaves `df` untouched.
    df_cleaned = (
        df
        .dropna(subset=['Year'])
        .assign(Year=lambda frame: frame['Year'].astype(int))
    )

    print("\nCleaned data head:")
    print(df_cleaned.head())

except Exception as e:
    print(f"An error occurred during cleaning: {e}")
    # Do not swallow the error: stop here with the full traceback.
    raise

# Bar chart - Comparison (Genres)

This chart tests my hypothesis that 'Action' and 'Sports' are the top-selling genres in Europe.

In [None]:
# Group the data by 'Genre' and sum the sales for Europe, largest first
genre_sales = df_cleaned.groupby('Genre')['EU_Sales'].sum().sort_values(ascending=False)

print("Top 5 Genres in Europe (Millions of Sales):")
print(genre_sales.head(5))

fig, ax = plt.subplots(figsize=(12, 7))

# `palette` without `hue` is deprecated in seaborn 0.13, so the genre is
# assigned to `hue` explicitly. dodge=False keeps one full-width bar per genre.
sns.barplot(
    x=genre_sales.index,
    y=genre_sales.values,
    hue=genre_sales.index,
    palette='viridis',
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the x-axis labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Total European Sales by Genre')
ax.set_xlabel('Genre')
ax.set_ylabel('Total Sales (in millions)')
plt.show()

# Horizontal bar chart - Comparison (Platforms)

This chart tests my hypothesis that PlayStation platforms are the most dominant in Europe.

In [None]:
# Group the data by 'Platform' and sum the sales for Europe, largest first
platform_sales = df_cleaned.groupby('Platform')['EU_Sales'].sum().sort_values(ascending=False)

# Display the top 5 as text
print("Top 5 Platforms in Europe (Millions of Sales):")
print(platform_sales.head(5))

# Only the 15 best-selling platforms are plotted; compute the slice once.
top_platforms = platform_sales.head(15)

fig, ax = plt.subplots(figsize=(12, 8))

# `palette` without `hue` is deprecated in seaborn 0.13, so the platform is
# assigned to `hue` explicitly. dodge=False keeps one full-width bar per platform.
sns.barplot(
    x=top_platforms.values,
    y=top_platforms.index,
    hue=top_platforms.index,
    palette='rocket',
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the y-axis labels; drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title('Top 15 Platforms by European Sales')
ax.set_xlabel('Total Sales (in millions)')
ax.set_ylabel('Platform')
plt.show()

# Line Plot Graph - Change Over Time

This chart analyses the trend of total video game sales in Europe over the years.

In [None]:
# Total European sales per release year
yearly_sales = df_cleaned.groupby('Year')['EU_Sales'].sum()

# Keep only the years strictly after 1990
after_1990 = yearly_sales.index > 1990
yearly_sales_filtered = yearly_sales.loc[after_1990]

# Draw the trend line on an explicit Axes, one marker per year
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x=yearly_sales_filtered.index,
    y=yearly_sales_filtered.values,
    marker='o',
    ax=ax,
)

ax.set(
    title='Total European Game Sales Per Year (Post-1990)',
    xlabel='Year',
    ylabel='Total Sales (in millions)',
)
ax.grid(True)
plt.show()

# Scatter Plot - Relationship

This chart analyses whether a game's sales in Europe are related to its sales in North America.

In [None]:
# Take a reproducible random sample of up to 1000 games to keep the scatter
# plot readable. The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when asked for more rows than exist.
sample_size = min(1000, len(df_cleaned))
df_sample = df_cleaned.sample(n=sample_size, random_state=1)

fig, ax = plt.subplots(figsize=(10, 6))
# alpha=0.5 makes overlapping points visible
sns.scatterplot(data=df_sample, x='NA_Sales', y='EU_Sales', alpha=0.5, ax=ax)

# The title reports the number of games actually plotted.
ax.set_title(f'Relationship: EU Sales vs. NA Sales ({len(df_sample)} random games)')
ax.set_xlabel('North American Sales (in millions)')
ax.set_ylabel('European Sales (in millions)')
ax.grid(True)
plt.show()

## Step 5: Conclusion & Analysis

My Hypothesis: In Europe, PlayStation platforms have been the most dominant. I also predict that the 'Sports' and 'Action' genres are the highest-selling genres in this region.


## Analysis of Results

**Part 1: Genre Preference (Action & Sports)**

- The first bar chart ("Total European Sales by Genre") shows that 'Action' is the #1 highest-selling genre in Europe.
- 'Sports' is the #2 highest-selling genre.
- Conclusion: the hypothesis was correct.

**Part 2: Platform Dominance (PlayStation)**

- The second bar chart ("Top 15 Platforms by European Sales") shows that the PS2 is the #1 best-selling platform in Europe.
- The PS3 is #2.
- The PC comes in at #3.
- The PS4 is #5.
- Conclusion: the hypothesis was correct. PlayStation platforms (PS2, PS3, PS4) hold three of the top five spots and are the most dominant platforms in Europe.