In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

---

## Sample Dataset

Let's create a sample dataset simulating bike rental data.

In [None]:
# Create a reproducible synthetic bike rental dataset: one row per calendar day.
np.random.seed(42)
n = 100

# Build the dates first so that the weekday column can be derived from them
# (keeps 'weekday' consistent with the actual calendar day of each row).
dates = pd.date_range('2024-01-01', periods=n, freq='D')

# NOTE: the random columns are generated in the same order as before, so the
# seeded values of every random column are unchanged.
bikes = pd.DataFrame({
    'date': dates,
    'season': np.random.choice([1, 2, 3, 4], n),  # 1=spring, 2=summer, 3=fall, 4=winter
    'holiday': np.random.choice([0, 1], n, p=[0.9, 0.1]),
    # 0=Sunday ... 6=Saturday. pandas' dayofweek is 0=Monday, hence the +1 shift.
    'weekday': ((dates.dayofweek + 1) % 7).astype('int64'),
    'weather': np.random.choice([1, 2, 3], n, p=[0.5, 0.35, 0.15]),
    'temp': np.random.uniform(0.2, 0.9, n).round(2),
    'humidity': np.random.uniform(0.3, 0.9, n).round(2),
    'casual': np.random.randint(10, 200, n),
    'registered': np.random.randint(100, 800, n)
})

# Add total count (computed before any values are blanked out, so it stays integer)
bikes['count'] = bikes['casual'] + bikes['registered']

# Add some missing values for demonstration.
# 'casual' is an integer column and integers cannot hold NaN, so cast it to float
# explicitly instead of relying on pandas silently upcasting during assignment.
bikes['casual'] = bikes['casual'].astype(float)
bikes.loc[5, 'temp'] = np.nan
bikes.loc[15, 'humidity'] = np.nan
bikes.loc[25, 'casual'] = np.nan

print("Sample Bike Rental Dataset:")
bikes.head()

---

## 1. Viewing Data

In [None]:
# Preview the top of the table (positional slice of the first three rows)
print("First 3 rows:")
bikes.iloc[:3]

In [None]:
# Preview the bottom of the table (positional slice of the last three rows)
print("Last 3 rows:")
bikes.iloc[-3:]

In [None]:
# Dimensions of the DataFrame: unpack the shape once and reuse it
n_rows, n_cols = bikes.shape
print("Shape (rows, columns):", (n_rows, n_cols))
print("Total elements:", n_rows * n_cols)
print("Number of rows:", n_rows)

In [None]:
# Column labels first, then the dtype pandas assigned to each column
column_names = list(bikes.columns)
column_dtypes = bikes.dtypes

print("Columns:")
print(column_names)
print("\nData types:")
print(column_dtypes)

In [None]:
# Comprehensive info: prints a concise summary of the DataFrame
# (index range, each column's non-null count and dtype, and memory usage).
# Handy for spotting columns with missing values at a glance.
bikes.info()

---

## 2. Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) using describe()'s
# default column selection. NaN values are excluded from each column's statistics.
bikes.describe()

In [None]:
# describe() also works on a single column (a Series)
casual_stats = bikes['casual'].describe()
print("Casual users statistics:")
print(casual_stats)

In [None]:
# Each statistic is also available as its own Series method
daily_total = bikes['count']
summary_stats = [
    ("Mean", daily_total.mean()),
    ("Median", daily_total.median()),
    ("Std", daily_total.std()),
    ("Min", daily_total.min()),
    ("Max", daily_total.max()),
]
for label, value in summary_stats:
    print(f"{label} count:", value)

---

## 3. Unique Values

In [None]:
# Distinct values of a column, and how many of them there are
season_col = bikes['season']
print("Unique seasons:", season_col.unique())
print("Number of unique seasons:", season_col.nunique())

In [None]:
# Frequency of each season code, most common first
season_counts = bikes['season'].value_counts()
print("Season distribution:")
print(season_counts)

In [None]:
# normalize=True returns proportions instead of raw counts; scale to percent
weather_share = bikes['weather'].value_counts(normalize=True)
print("Weather distribution (%):")
print(weather_share.round(2) * 100)

---

## 4. Handling Missing Values

In [None]:
# Count NaN entries column by column (isna is the same check as isnull)
missing_per_column = bikes.isna().sum()
print("Missing values per column:")
print(missing_per_column)

In [None]:
# Sum the per-column NaN counts to get one grand total
nan_counts = bikes.isna().sum()
print("Total missing:", nan_counts.sum())

In [None]:
# Boolean mask: True for every row that has at least one NaN
has_missing = bikes.isna().any(axis=1)
print("Rows with missing values:")
bikes.loc[has_missing]

In [None]:
# Fill missing values, leaving the original `bikes` DataFrame untouched.
#
# One replacement value per column:
#   - temp, humidity -> column mean
#   - casual         -> column median
# (mean/median skip NaN by default, so they are computed from the observed values.)
fill_values = {
    'temp': bikes['temp'].mean(),
    'humidity': bikes['humidity'].mean(),
    'casual': bikes['casual'].median(),
}

# DataFrame.fillna with a dict fills each listed column with its own value and
# returns a new DataFrame. This avoids `df[col].fillna(..., inplace=True)`, which
# operates on a temporary Series: it raises a warning in recent pandas and does
# not modify the DataFrame at all once Copy-on-Write is enabled.
bikes_filled = bikes.fillna(value=fill_values)

print("After filling:")
print(bikes_filled.isnull().sum())

In [None]:
# Alternative to imputing: discard every row that contains a NaN
bikes_dropped = bikes.dropna()
rows_before, rows_after = len(bikes), len(bikes_dropped)
print(f"Original rows: {rows_before}, After dropping: {rows_after}")

---

## 5. Handling Duplicates

In [None]:
# duplicated() flags rows that repeat an earlier row in full
duplicate_mask = bikes.duplicated()
print("Number of duplicate rows:", duplicate_mask.sum())

In [None]:
# Restrict the duplicate check to the 'date' column only
repeated_dates = bikes.duplicated(subset=['date'])
print("Duplicate dates:", repeated_dates.sum())

In [None]:
# Keep only the first occurrence of each row (same rows drop_duplicates() keeps)
is_repeat = bikes.duplicated()
bikes_unique = bikes.loc[~is_repeat]
print(f"Rows after removing duplicates: {len(bikes_unique)}")

---

## 6. Data Transformation

In [None]:
# Derive calendar features from the datetime column via the .dt accessor
date_parts = bikes_filled['date'].dt
for part in ('year', 'month', 'day'):
    bikes_filled[part] = getattr(date_parts, part)
bikes_filled['day_name'] = date_parts.day_name()  # method, not attribute

print("With date features:")
date_feature_cols = ['date', 'year', 'month', 'day', 'day_name']
bikes_filled[date_feature_cols].head()

In [None]:
# Translate numeric season codes into readable labels with a lookup dict
season_map = dict(zip([1, 2, 3, 4], ['Spring', 'Summer', 'Fall', 'Winter']))
season_labels = bikes_filled['season'].map(season_map)
bikes_filled['season_name'] = season_labels

print("Season mapping:")
bikes_filled[['season', 'season_name']].head()

In [None]:
# Create a new calculated column: the share of each day's rentals made by
# casual (non-registered) users, as a fraction of the daily total.
#
# Note: 'count' is itself casual + registered, so a ratio of 'count' to that sum
# would be a constant 1 and carry no information. casual / count is the
# meaningful ratio that can be built from these columns.
bikes_filled['casual_share'] = bikes_filled['casual'] / bikes_filled['count']
bikes_filled['casual_share'].head()

---

## 7. Filtering Data

In [None]:
# Single-condition filter: keep rows whose season code is 2 (summer)
is_summer = bikes_filled['season'].eq(2)
summer_data = bikes_filled.loc[is_summer]
print(f"Summer records: {len(summer_data)}")
summer_data.head()

In [None]:
# Multi-condition filter: build each mask separately, then combine with &
is_clear = bikes_filled['weather'].eq(1)
is_busy = bikes_filled['count'].gt(500)
good_weather_busy = bikes_filled.loc[is_clear & is_busy]
print(f"Good weather & busy days: {len(good_weather_busy)}")
good_weather_busy.head()

In [None]:
# Rows flagged as holidays (holiday == 1)
is_holiday = bikes_filled['holiday'].eq(1)
holidays = bikes_filled.loc[is_holiday]
print(f"Holidays: {len(holidays)}")

---

## 8. Group By Operations

In [None]:
# Group rows by season code; the grouped object is reused in the next cell
by_season = bikes_filled.groupby('season')
count_by_season = by_season['count'].sum()

print("Total count by season:")
print(count_by_season)

In [None]:
# Several aggregations at once: each name in the list becomes an output column
count_aggregations = ['sum', 'mean', 'min', 'max']
print("Statistics by season:")
by_season['count'].agg(count_aggregations)

In [None]:
# Two grouping keys give a two-level index; unstack() pivots weather into columns
by_season_weather = bikes_filled.groupby(['season', 'weather'])
mean_count = by_season_weather['count'].mean()

print("Average count by season and weather:")
mean_count.unstack()

In [None]:
# Per-column aggregation spec: column name -> function name (or list of them)
aggregation_spec = {
    'casual': 'mean',
    'registered': 'mean',
    'count': ['sum', 'mean'],
}
grouped_by_name = bikes_filled.groupby('season_name')
season_summary = grouped_by_name.agg(aggregation_spec).round(2)

print("Season summary:")
season_summary

---

## 9. Sorting Data

In [None]:
# Order rows by total rentals, largest first, and show the leading five
print("Top 5 busiest days:")
busiest_first = bikes_filled.sort_values(by='count', ascending=False)
busiest_first.iloc[:5]

In [None]:
# Sort keys are applied left to right: season ascending, then count descending
print("Sorted by season, then count:")
sort_keys = ['season', 'count']
sort_ascending = [True, False]
bikes_filled.sort_values(by=sort_keys, ascending=sort_ascending).iloc[:5]

---

## 10. Basic Plotting with Pandas

In [None]:
# Line plot - daily totals drawn against the row index
plt.figure(figsize=(12, 4))
daily_counts = bikes_filled['count']
daily_counts.plot.line()
plt.title('Daily Bike Rentals')
plt.xlabel('Day')
plt.ylabel('Count')
plt.show()

In [None]:
# Bar plot - category comparison
#
# groupby() sorts its keys alphabetically (Fall, Spring, Summer, Winter), so the
# result is reindexed into calendar order. That keeps each bar paired with the
# colour intended for its season: Spring=green, Summer=orange, Fall=brown, Winter=blue.
season_order = ['Spring', 'Summer', 'Fall', 'Winter']
season_colors = ['green', 'orange', 'brown', 'blue']

season_totals = (
    bikes_filled.groupby('season_name')['count']
    .sum()
    .reindex(season_order)
)

plt.figure(figsize=(8, 5))
season_totals.plot(kind='bar', color=season_colors)
plt.title('Total Rentals by Season')
plt.xlabel('Season')
plt.ylabel('Total Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Histogram - how daily totals are distributed across 20 bins
plt.figure(figsize=(8, 5))
daily_counts = bikes_filled['count']
daily_counts.plot.hist(bins=20, edgecolor='black')
plt.title('Distribution of Daily Rentals')
plt.xlabel('Count')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Box plot - compare distributions
#
# DataFrame.boxplot(by=...) opens its own figure unless it is handed an Axes, which
# would leave an empty extra figure behind and ignore the requested figsize.
# Creating the Axes up front and passing it in keeps everything on one figure.
fig, ax = plt.subplots(figsize=(8, 5))
bikes_filled.boxplot(column='count', by='season', ax=ax)
ax.set_title('Rental Distribution by Season')
fig.suptitle('')  # Remove the automatic "Boxplot grouped by ..." title
ax.set_xlabel('Season')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Scatter plot - relationship between variables
#
# DataFrame.plot creates a new figure when no Axes is supplied, so a preceding
# plt.figure(...) would be left empty and its figsize would not apply.
# Passing ax= draws the scatter on the figure created here.
fig, ax = plt.subplots(figsize=(8, 5))
bikes_filled.plot(kind='scatter', x='temp', y='count', alpha=0.5, ax=ax)
ax.set_title('Temperature vs Rentals')
ax.set_xlabel('Temperature (normalized)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pie chart - proportions
#
# value_counts() orders its result by frequency, not by weather code, so labels
# passed positionally could end up on the wrong slice. Renaming the index through
# an explicit code -> name mapping ties every label to its own code.
weather_labels = {1: 'Clear', 2: 'Cloudy', 3: 'Rain'}
weather_counts = (
    bikes_filled['weather']
    .value_counts()
    .sort_index()                  # stable slice order: code 1, 2, 3
    .rename(index=weather_labels)  # the index values become the slice labels
)

plt.figure(figsize=(8, 6))
weather_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Weather Distribution')
plt.ylabel('')  # Hide the Series name that pandas puts on the y-axis
plt.show()

---

## 📝 Practice Problems

### Problem 1: Data Inspection
Using the `bikes_filled` DataFrame:
1. How many rows and columns are there?
2. What are the data types of each column?
3. What is the average temperature?

In [None]:
# Your solution here

### Problem 2: Filtering and Aggregation
1. Find all records where it's a holiday AND weather is good (weather == 1)
2. Calculate the average count for weekdays vs weekends
3. Find the day with the maximum number of casual users

In [None]:
# Your solution here

### Problem 3: Group By Analysis
1. Calculate total casual and registered users by season
2. Find the average count for each weekday (0=Sunday, 6=Saturday)
3. Which season has the highest average temperature?

In [None]:
# Your solution here

### Problem 4: Visualization
1. Create a bar chart showing average count by weekday
2. Create a scatter plot of humidity vs count
3. Create a line plot showing the 7-day rolling average of count

In [None]:
# Your solution here

---

## ✅ Solutions

### Solution 1: Data Inspection

In [None]:
# Solution 1: shape, dtypes and mean temperature of the cleaned frame
n_rows, n_cols = bikes_filled.shape
print("1. Shape:", (n_rows, n_cols))
print(f"   Rows: {n_rows}, Columns: {n_cols}")

print("\n2. Data types:")
print(bikes_filled.dtypes)

mean_temp = bikes_filled['temp'].mean()
print(f"\n3. Average temperature: {mean_temp:.4f}")

### Solution 2: Filtering and Aggregation

In [None]:
# Solution 2

# 1. Holiday AND good weather: combine two boolean masks
is_holiday = bikes_filled['holiday'].eq(1)
is_clear = bikes_filled['weather'].eq(1)
holiday_good_weather = bikes_filled.loc[is_holiday & is_clear]
print(f"1. Holiday with good weather: {len(holiday_good_weather)} records")

# 2. Average count, weekday vs weekend (weekday codes: 0=Sunday, 6=Saturday)
bikes_filled['is_weekend'] = bikes_filled['weekday'].isin([0, 6])
mean_by_day_type = bikes_filled.groupby('is_weekend')['count'].mean()
day_type_names = {False: 'Weekday', True: 'Weekend'}
print("\n2. Average count:")
print(mean_by_day_type.rename(day_type_names))

# 3. Row holding the largest number of casual users
max_casual_idx = bikes_filled['casual'].idxmax()
print("\n3. Day with max casual users:")
print(bikes_filled.loc[max_casual_idx, ['date', 'casual']])

### Solution 3: Group By Analysis

In [None]:
# Solution 3

# 1. Casual and registered totals for each season
user_totals = bikes_filled.groupby('season_name')[['casual', 'registered']].sum()
print("1. Total users by season:")
print(user_totals)

# 2. Mean daily count for each weekday code
weekday_means = bikes_filled.groupby('weekday')['count'].mean()
print("\n2. Average count by weekday:")
print(weekday_means.round(2))

# 3. Season whose mean temperature is highest
temp_by_season = bikes_filled.groupby('season_name')['temp'].mean()
warmest_season = temp_by_season.idxmax()
warmest_temp = temp_by_season.max()
print(f"\n3. Warmest season: {warmest_season} ({warmest_temp:.4f})")

### Solution 4: Visualization

In [None]:
# Solution 4

# 1. Bar chart - mean count for each weekday code
weekday_means = bikes_filled.groupby('weekday')['count'].mean()
plt.figure(figsize=(10, 4))
weekday_means.plot.bar()
plt.title('Average Rentals by Weekday')
plt.xlabel('Weekday (0=Sun, 6=Sat)')
plt.ylabel('Average Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# 2. Scatter plot - humidity against count
humidity_values = bikes_filled['humidity']
count_values = bikes_filled['count']
plt.figure(figsize=(8, 5))
plt.scatter(humidity_values, count_values, alpha=0.5)
plt.title('Humidity vs Rentals')
plt.xlabel('Humidity')
plt.ylabel('Count')
plt.show()

# 3. Line plot - 7-day rolling mean (the first six values are NaN: window not yet full)
rolling_mean = bikes_filled['count'].rolling(window=7).mean()
plt.figure(figsize=(12, 4))
rolling_mean.plot()
plt.title('7-Day Rolling Average of Rentals')
plt.xlabel('Day')
plt.ylabel('Average Count')
plt.show()

---

## 📌 Summary

| Task | Function |
|------|----------|
| View data | `head()`, `tail()`, `info()` |
| Statistics | `describe()`, `mean()`, `std()` |
| Unique values | `unique()`, `nunique()`, `value_counts()` |
| Missing values | `isnull()`, `fillna()`, `dropna()` |
| Duplicates | `duplicated()`, `drop_duplicates()` |
| Group by | `groupby()`, `agg()` |
| Sort | `sort_values()`, `sort_index()` |
| Plot | `plot(kind='line/bar/hist/scatter')` |

**Next:** [28_matplotlib_basics.ipynb](28_matplotlib_basics.ipynb) - Introduction to Matplotlib