# Uber Dataset Analysis - Quick Start Guide

This notebook demonstrates basic analysis techniques using the Uber datasets.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn's "whitegrid" style and 12x6 figures
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

## 2. Load Datasets

In [None]:
# Read the five raw CSV files, one DataFrame per file.
# The order of the paths below must match the order of the names on the left.
rides, drivers, locations, hourly_stats, fare_details = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/uber_rides.csv',
        '../data/raw/drivers.csv',
        '../data/raw/locations.csv',
        '../data/raw/hourly_stats.csv',
        '../data/raw/fare_details.csv',
    )
)

# Quick sanity check: confirm the load and report row counts
print("Datasets loaded successfully!")
print(f"Total rides: {len(rides)}")
print(f"Total drivers: {len(drivers)}")
print(f"Total locations: {len(locations)}")

## 3. Explore Rides Dataset

In [None]:
# Preview the first five rows of the rides table
rides.head(n=5)

In [None]:
# Summary statistics for the rides table, as produced by DataFrame.describe()
rides.describe()

In [None]:
# Number of missing entries in each column of the rides table
rides.isna().sum()

## 4. Basic Analysis

### 4.1 Average Fare by Payment Type

In [None]:
# Mean fare for each payment type
avg_fare_by_payment = rides.groupby('payment_type')['fare_amount'].mean()
print(avg_fare_by_payment)

# Bar chart of the means, drawn on an explicit Axes
fig, ax = plt.subplots()
avg_fare_by_payment.plot.bar(ax=ax, color='skyblue')
ax.set_title('Average Fare by Payment Type')
ax.set_ylabel('Average Fare (USD)')
ax.set_xlabel('Payment Type')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

### 4.2 Trip Duration Distribution

In [None]:
# Histogram of trip durations with a dashed marker at the mean
durations = rides['trip_duration_minutes']

fig, ax = plt.subplots()
ax.hist(durations, bins=20, color='lightgreen', edgecolor='black')
ax.set(
    title='Distribution of Trip Duration',
    xlabel='Duration (minutes)',
    ylabel='Frequency',
)
ax.axvline(durations.mean(), color='red', linestyle='--', label='Mean')
ax.legend()
plt.show()

### 4.3 Passenger Count Analysis

In [None]:
# Rides tallied per passenger count, ordered by the count value itself
passenger_counts = rides['passenger_count'].value_counts().sort_index()
print(passenger_counts)

# Bar chart of the tallies
fig, ax = plt.subplots()
passenger_counts.plot.bar(ax=ax, color='coral')
ax.set(
    title='Distribution of Passenger Count',
    xlabel='Number of Passengers',
    ylabel='Number of Rides',
)
fig.tight_layout()
plt.show()

## 5. Driver Analysis

In [None]:
# Five highest-rated drivers, restricted to the columns shown in the report
driver_report_cols = ['name', 'rating', 'total_rides', 'years_experience']
top_drivers = drivers[driver_report_cols].nlargest(5, 'rating')

print("Top 5 Rated Drivers:")
print(top_drivers)

In [None]:
# Scatter plot: one point per driver, experience on x and rating on y
fig, ax = plt.subplots()
ax.scatter(drivers['years_experience'], drivers['rating'], alpha=0.6, s=100)
ax.set(
    title='Driver Rating vs Years of Experience',
    xlabel='Years of Experience',
    ylabel='Rating',
)
ax.grid(True, alpha=0.3)
plt.show()

## 6. Time-based Analysis

In [None]:
# Hourly ride distribution
# Build each row's timestamp arithmetically (calendar date + hour offset) rather than
# by concatenating strings, so the result does not depend on how `hour` renders as
# text (for example "8.0" if the column was read as float).
# NOTE(review): assumes `hour` is a numeric hour-of-day column — confirm against the CSV.
hourly_stats['datetime'] = (
    pd.to_datetime(hourly_stats['date'])
    + pd.to_timedelta(hourly_stats['hour'], unit='h')
)

# A line plot joins points in row order; sort chronologically so the line never
# doubles back on itself when the CSV rows are not already in time order.
hourly_sorted = hourly_stats.sort_values('datetime')

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(hourly_sorted['datetime'], hourly_sorted['total_rides'], marker='o', linewidth=2)
ax.set_title('Hourly Ride Volume Over Time')
ax.set_xlabel('Date and Hour')
ax.set_ylabel('Number of Rides')
ax.tick_params(axis='x', rotation=45)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Fare Analysis

In [None]:
# Join every ride to its fare breakdown on the shared ride_id key
rides_with_fares = pd.merge(rides, fare_details, on='ride_id')
# NOTE(review): the rest of this cell reads fare_details directly, not rides_with_fares.

# Mean of each fare component across all rows of fare_details
component_cols = ['base_fare', 'distance_fare', 'time_fare', 'tip_amount', 'taxes_fees']
fare_components = fare_details[component_cols].mean()

# One bar per component, each with its own colour
component_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
fig, ax = plt.subplots(figsize=(10, 6))
fare_components.plot.bar(ax=ax, color=component_colors)
ax.set_title('Average Fare Components')
ax.set_ylabel('Amount (USD)')
ax.set_xlabel('Fare Component')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 8. Surge Pricing Analysis

In [None]:
# Per surge multiplier: how many rides used it and their mean total fare.
# Named aggregation produces the final column names directly.
surge_stats = fare_details.groupby('surge_multiplier').agg(
    num_rides=('ride_id', 'count'),
    avg_fare=('total_fare', 'mean'),
)

print("Surge Pricing Statistics:")
print(surge_stats)

## 9. Geographic Analysis

In [None]:
# List the locations flagged as popular pickup / dropoff points
location_cols = ['location_name', 'borough', 'zone']

print("Popular Pickup Locations:")
is_popular_pickup = locations['popular_pickup'].eq(True)
print(locations.loc[is_popular_pickup, location_cols])

print("\nPopular Dropoff Locations:")
is_popular_dropoff = locations['popular_dropoff'].eq(True)
print(locations.loc[is_popular_dropoff, location_cols])

## 10. Comprehensive Dashboard

In [None]:
# 2x2 dashboard: fare histogram, duration-vs-fare scatter, payment mix, rides per date
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(fare_ax, duration_ax), (payment_ax, daily_ax) = axes

# Top-left: distribution of fares
fare_ax.hist(rides['fare_amount'], bins=15, color='skyblue', edgecolor='black')
fare_ax.set(title='Fare Distribution', xlabel='Fare (USD)', ylabel='Frequency')

# Top-right: fare against trip duration
duration_ax.scatter(rides['trip_duration_minutes'], rides['fare_amount'], alpha=0.6)
duration_ax.set(title='Trip Duration vs Fare', xlabel='Duration (minutes)', ylabel='Fare (USD)')

# Bottom-left: share of rides per payment type
payment_share = rides['payment_type'].value_counts()
payment_share.plot.pie(ax=payment_ax, autopct='%1.1f%%')
payment_ax.set_title('Payment Type Distribution')
payment_ax.set_ylabel('')  # blank out the y-label on the pie

# Bottom-right: number of rides recorded on each date
rides_by_date = rides.groupby('date').size()
rides_by_date.plot.bar(ax=daily_ax, color='lightgreen')
daily_ax.set(title='Rides by Date', xlabel='Date', ylabel='Number of Rides')
daily_ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 11. Summary Statistics

In [None]:
# Headline figures for the rides and drivers tables, framed by a 60-character rule
rule = "=" * 60
print(rule, "UBER DATASET SUMMARY STATISTICS", rule, sep="\n")

# Table sizes and ride-level averages
print(f"\nTotal Rides: {len(rides)}")
print(f"Total Drivers: {len(drivers)}")
print(f"Average Fare: ${rides['fare_amount'].mean():.2f}")
print(f"Average Trip Duration: {rides['trip_duration_minutes'].mean():.2f} minutes")
print(f"Average Passenger Count: {rides['passenger_count'].mean():.2f}")

# Number of rides per payment method
print("\nPayment Methods:")
print(rides['payment_type'].value_counts())

# Driver-level figures
print(f"\nDriver Average Rating: {drivers['rating'].mean():.2f}")
print(f"Most Experienced Driver: {drivers['years_experience'].max()} years")
print(rule)