# 🍔 Food Delivery Data Integration

This notebook integrates data from three different sources:
- **orders.csv** - Transactional order data
- **users.json** - User master data
- **restaurants.sql** - Restaurant master data

We'll merge these datasets and create a unified DataFrame for analysis.

## Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings, applied in one call:
# no limit on displayed columns, at most 100 displayed rows.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', 100,
)

print("✅ Libraries imported successfully!")

## Step 2: Load CSV Data (Orders)

In [None]:
# Read the order records from the CSV file into a DataFrame
orders_df = pd.read_csv('orders.csv')

# Report what was loaded with a single print call, one item per line
print(
    "📊 Orders DataFrame loaded!",
    f"   Shape: {orders_df.shape}",
    f"   Columns: {list(orders_df.columns)}",
    "\n🔍 First 5 rows:",
    sep="\n",
)

# Cell output: preview of the first five rows
orders_df.head(5)

## Step 3: Load JSON Data (Users)

In [None]:
# Load users data from JSON.
# The encoding is given explicitly: JSON text is UTF-8 by specification, while
# open() without an encoding uses the platform default (e.g. cp1252 on
# Windows), which can fail on or garble non-ASCII text.
with open('users.json', 'r', encoding='utf-8') as f:
    users_data = json.load(f)

# Convert to DataFrame
# NOTE(review): presumably the file holds a list of user records — confirm
users_df = pd.DataFrame(users_data)

print("📊 Users DataFrame loaded!")
print(f"   Shape: {users_df.shape}")
print(f"   Columns: {list(users_df.columns)}")
print("\n🔍 First 5 rows:")
users_df.head()

## Step 4: Load SQL Data (Restaurants)

In [None]:
# Parse SQL INSERT statements to extract restaurant data
def parse_sql_file(filename):
    """Parse ``INSERT INTO restaurants VALUES (...)`` statements into a DataFrame.

    Each recognised statement has the shape
    ``INSERT INTO restaurants VALUES (<int>, '<text>', '<text>', <number>);``.
    Keyword case and whitespace between tokens are flexible, and a doubled
    single quote (``''``) inside a text value is read as one literal quote.
    Statements that do not match this shape are skipped.

    Parameters
    ----------
    filename : str or path-like
        Path of the SQL file to read (decoded as UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per matched statement with the columns ``restaurant_id``
        (int), ``restaurant_name`` (str), ``cuisine`` (str) and ``rating``
        (float). When nothing matches, the frame is empty but still has these
        four columns, so a later merge on ``restaurant_id`` does not fail
        with a ``KeyError``.
    """
    column_names = ['restaurant_id', 'restaurant_name', 'cuisine', 'rating']

    # Explicit encoding: the platform default differs between systems and can
    # fail on or garble non-ASCII text.
    with open(filename, 'r', encoding='utf-8') as f:
        sql_content = f.read()

    # Verbose pattern: literal whitespace below is ignored, \s matches the input.
    insert_pattern = re.compile(
        r"""
        INSERT\s+INTO\s+restaurants\s+VALUES\s*
        \(\s*
            (\d+)\s*,\s*                     # integer id
            '((?:[^']|'')*)'\s*,\s*          # first text value, '' = escaped quote
            '((?:[^']|'')*)'\s*,\s*          # second text value, '' = escaped quote
            (\d+(?:\.\d*)?|\.\d+)\s*         # number that float() can always parse
        \)\s*;
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    records = [
        {
            'restaurant_id': int(raw_id),
            'restaurant_name': raw_name.replace("''", "'"),
            'cuisine': raw_cuisine.replace("''", "'"),
            'rating': float(raw_rating),
        }
        for raw_id, raw_name, raw_cuisine, raw_rating in insert_pattern.findall(sql_content)
    ]

    # Passing the column list keeps the schema stable even with zero records.
    return pd.DataFrame(records, columns=column_names)

# Build the restaurants DataFrame from the SQL file via parse_sql_file
restaurants_df = parse_sql_file('restaurants.sql')

# Report what was loaded with a single print call, one item per line
print(
    "📊 Restaurants DataFrame loaded!",
    f"   Shape: {restaurants_df.shape}",
    f"   Columns: {list(restaurants_df.columns)}",
    "\n🔍 First 5 rows:",
    sep="\n",
)

# Cell output: preview of the first five rows
restaurants_df.head(5)

## Step 5: Data Quality Check

In [None]:
print("📋 Data Quality Summary:\n")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line
# after each report.
print("Orders DataFrame:")
orders_df.info()
print(f"\nMissing values:\n{orders_df.isnull().sum()}\n")
print("="*50)

print("\nUsers DataFrame:")
users_df.info()
print(f"\nMissing values:\n{users_df.isnull().sum()}\n")
print("="*50)

print("\nRestaurants DataFrame:")
restaurants_df.info()
print(f"\nMissing values:\n{restaurants_df.isnull().sum()}")

## Step 6: Merge the Datasets

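We'll perform **left joins** so that every order is retained, even when its `user_id` or `restaurant_id` has no match in the master data (the columns from that source are then `NaN`):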
1. Merge orders with users on `user_id`
2. Merge result with restaurants on `restaurant_id`

In [None]:
# Both joins are left joins from the orders side, so every order row is kept.
# validate='many_to_one' makes pandas raise a MergeError if a key is repeated
# in the right-hand (master) table. Without it, a duplicated user_id or
# restaurant_id would silently duplicate order rows and inflate every
# downstream count and sum.

# Step 1: Merge orders with users
print("🔗 Merging orders with users...")
merged_df = orders_df.merge(
    users_df,
    on='user_id',
    how='left',
    validate='many_to_one',
)
print(f"   After user merge: {merged_df.shape}")

# Step 2: Merge with restaurants (using suffixes to distinguish duplicate columns)
print("🔗 Merging with restaurants...")
final_dataset = merged_df.merge(
    restaurants_df,
    on='restaurant_id',
    how='left',
    suffixes=('_order', '_restaurant'),
    validate='many_to_one',
)
print(f"   After restaurant merge: {final_dataset.shape}")

print("\n✅ Final dataset created successfully!")
print(f"\n📊 Final Dataset Shape: {final_dataset.shape}")
print(f"   Total Orders: {len(final_dataset)}")
print(f"   Total Columns: {len(final_dataset.columns)}")
print(f"\nColumns: {list(final_dataset.columns)}")
# NOTE(review): the message below assumes orders.csv also has a
# 'restaurant_name' column, so the suffixes above apply — confirm against the data
print("\n💡 Note: 'restaurant_name_order' is from orders.csv, 'restaurant_name_restaurant' is from restaurants.sql")

## Step 7: Preview Final Dataset

In [None]:
# Display the first 10 rows of the merged dataset as the cell's output
final_dataset.head(10)

In [None]:
# Print a structural summary of the merged dataset (columns, non-null counts, dtypes)
final_dataset.info()

In [None]:
# Summary statistics of the merged dataset; with default arguments pandas
# describes only the numeric columns when the frame has any
final_dataset.describe()

## Step 8: Save Final Dataset (Optional)

In [None]:
# Uncomment to save the final dataset to CSV
# final_dataset.to_csv('final_food_delivery_dataset.csv', index=False)
# print("✅ Final dataset saved to 'final_food_delivery_dataset.csv'")

---

## 🎯 Your Analysis Here

The `final_dataset` DataFrame is ready for your exploratory data analysis!

**Happy Analyzing! 🚀**

## Analysis: Which city has the highest revenue from Gold members?

In [None]:
# Keep only the rows whose 'membership' value is exactly 'Gold'
gold_members = final_dataset[final_dataset['membership'] == 'Gold']

# Total 'total_amount' per city for those rows, largest first.
# groupby drops rows whose 'city' is missing by default.
revenue_by_city = (
    gold_members
    .groupby('city')['total_amount']
    .sum()
    .sort_values(ascending=False)
)

print("=" * 60)
print("TOTAL REVENUE BY CITY (GOLD MEMBERS ONLY)")
print("=" * 60)
print(revenue_by_city)
print("\n" + "=" * 60)
if revenue_by_city.empty:
    # idxmax() raises ValueError on an empty Series, so report the absence
    # of data instead of crashing the cell.
    print("ANSWER: no Gold-member orders with a known city were found")
else:
    print(f"ANSWER: {revenue_by_city.idxmax()} has the highest revenue")
    print(f"Revenue: Rs. {revenue_by_city.max():,.2f}")
print("=" * 60)

In [None]:
# Your custom analysis here
