In [None]:
import json
import os

import numpy as np
import pandas as pd

---

## 1. Reading CSV Files

CSV (Comma-Separated Values) is the most common format for tabular data.

### 1.1 Basic CSV Reading

In [None]:
# Assemble a five-row employee table column by column, then save it so the
# reading examples below have a real file on disk.
employee_columns = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 22],
    'salary': [50000, 60000, 75000, 55000, 45000],
    'department': ['HR', 'IT', 'Finance', 'IT', 'HR'],
}
sample_data = pd.DataFrame(employee_columns)

# index=False keeps the row index out of the file
sample_data.to_csv('employees.csv', index=False)
print("Sample CSV created: employees.csv")

In [None]:
# Load the employee file into a DataFrame; the bare name on the last line
# makes the notebook render it as a table.
csv_path = 'employees.csv'
df = pd.read_csv(csv_path)
print("DataFrame from CSV:")
df

In [None]:
# Quick structural summary: dimensions, column labels, and per-column dtypes
n_rows_cols = df.shape
column_labels = df.columns.tolist()
print("Shape:", n_rows_cols)
print("\nColumns:", column_labels)
print("\nData types:")
print(df.dtypes)

### 1.2 Common read_csv() Parameters

In [None]:
# Write a semicolon-delimited file that starts with a '#' comment line and
# contains one missing score ('NA'), to exercise the read_csv options below.
complex_lines = [
    "# This is a comment line",
    "id;name;score;date",
    "1;Alice;95.5;2024-01-15",
    "2;Bob;87.3;2024-01-16",
    "3;Charlie;NA;2024-01-17",
    "4;Diana;92.1;2024-01-18",
]
with open('complex_data.csv', 'w') as f:
    # Every line, including the last, is newline-terminated
    f.write("\n".join(complex_lines) + "\n")
print("Complex CSV created")

In [None]:
# Collect the parsing options first, then hand them to read_csv in one call
read_options = {
    'sep': ';',                    # Separator (delimiter)
    'comment': '#',                # Ignore text after '#' (whole line if it starts with '#')
    'na_values': ['NA', 'N/A'],    # Values to treat as NaN
    'parse_dates': ['date'],       # Parse date column
}
df = pd.read_csv('complex_data.csv', **read_options)

print("DataFrame with parameters:")
print(df)
print("\nData types:")
print(df.dtypes)

In [None]:
# usecols restricts parsing to the listed columns
wanted_columns = ['name', 'salary']
df_subset = pd.read_csv('employees.csv', usecols=wanted_columns)
print("Only specific columns:")
df_subset

In [None]:
# nrows stops parsing after that many data rows (useful for peeking at large files)
preview_rows = 3
df_head = pd.read_csv('employees.csv', nrows=preview_rows)
print("First 3 rows only:")
df_head

In [None]:
# index_col promotes the named column to the row index
index_column = 'id'
df_indexed = pd.read_csv('employees.csv', index_col=index_column)
print("With 'id' as index:")
df_indexed

---

## 2. Writing CSV Files

In [None]:
# Small product table used by the writing examples in this section
product_columns = {
    'product': ['Apple', 'Banana', 'Orange'],
    'price': [1.50, 0.75, 1.25],
    'quantity': [100, 150, 80],
}
df = pd.DataFrame(product_columns)
df

In [None]:
# Basic write (includes index by default)
df.to_csv('products_with_index.csv')
print("With index:")
# Echo the raw file through a context manager so the handle is closed
# deterministically; to_csv writes UTF-8 by default, so read it back the same way.
with open('products_with_index.csv', encoding='utf-8') as f:
    print(f.read())

In [None]:
# Write without index (most common)
df.to_csv('products.csv', index=False)
print("Without index:")
# Echo the raw file through a context manager so the handle is closed
# deterministically; to_csv writes UTF-8 by default, so read it back the same way.
with open('products.csv', encoding='utf-8') as f:
    print(f.read())

In [None]:
# Write with custom separator
df.to_csv('products_semicolon.csv', index=False, sep=';')
print("With semicolon separator:")
# Echo the raw file through a context manager so the handle is closed
# deterministically; to_csv writes UTF-8 by default, so read it back the same way.
with open('products_semicolon.csv', encoding='utf-8') as f:
    print(f.read())

---

## 3. Reading Excel Files

Pandas can read Excel files using an optional engine library: `openpyxl` for `.xlsx` files and `xlrd` for legacy `.xls` files.

In [None]:
# Grades table that the Excel examples write out and read back
grade_columns = {
    'name': ['Alice', 'Bob', 'Charlie'],
    'math': [90, 85, 88],
    'english': [88, 92, 85],
}
df_excel = pd.DataFrame(grade_columns)

# Writing .xlsx needs the optional openpyxl package; report how to get it
# instead of failing the cell when it is absent.
try:
    df_excel.to_excel('students.xlsx', sheet_name='Grades', index=False)
except ModuleNotFoundError:
    print("Install openpyxl: pip install openpyxl")
else:
    print("Excel file created: students.xlsx")

In [None]:
# Read Excel file
try:
    df = pd.read_excel('students.xlsx', sheet_name='Grades')
    print("DataFrame from Excel:")
    print(df)
except FileNotFoundError:
    print("Excel file not found")
except ImportError:
    # pandas reports a missing optional engine with ImportError, the parent
    # class of ModuleNotFoundError; catching only the subclass would let it
    # escape and crash the cell.
    print("Install openpyxl: pip install openpyxl")

### 3.1 Reading Multiple Sheets

In [None]:
# Write the same grades table to two sheets of one workbook. ExcelWriter is
# used as a context manager so the file is saved when the block exits.
try:
    with pd.ExcelWriter('multi_sheet.xlsx') as writer:
        for sheet in ('Sheet1', 'Sheet2'):
            df_excel.to_excel(writer, sheet_name=sheet, index=False)
except ModuleNotFoundError:
    print("Install openpyxl: pip install openpyxl")
else:
    print("Multi-sheet Excel created")

In [None]:
# Read all sheets into a dictionary (sheet_name=None -> {sheet name: DataFrame})
try:
    all_sheets = pd.read_excel('multi_sheet.xlsx', sheet_name=None)
    print("Sheet names:", list(all_sheets.keys()))
    print("\nSheet1:")
    print(all_sheets['Sheet1'])
except FileNotFoundError:
    print("Excel file not found")
except ImportError:
    # pandas reports a missing optional engine with ImportError, the parent
    # class of ModuleNotFoundError; catching only the subclass would let it
    # escape and crash the cell.
    print("Install openpyxl: pip install openpyxl")

---

## 4. Reading JSON Files

In [None]:
# Create JSON file: a list of records, one object per person
data = [
    {"name": "Alice", "age": 25, "city": "NYC"},
    {"name": "Bob", "age": 30, "city": "LA"},
    {"name": "Charlie", "age": 35, "city": "Chicago"}
]

# Write as UTF-8 explicitly: pd.read_json decodes as UTF-8 by default, whereas
# open() would otherwise fall back to the platform's locale encoding.
with open('people.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)
print("JSON file created")

In [None]:
# Parse the JSON file written above into a DataFrame and display it
json_path = 'people.json'
df_json = pd.read_json(json_path)
print("DataFrame from JSON:")
df_json

In [None]:
# Write DataFrame to JSON (orient='records' -> a list of row objects)
df_json.to_json('people_output.json', orient='records', indent=2)
print("JSON output:")
# Echo the raw file through a context manager so the handle is closed
# deterministically rather than whenever the garbage collector gets to it.
with open('people_output.json', encoding='utf-8') as f:
    print(f.read())

---

## 5. Reading from URLs

In [None]:
# Pandas readers accept a URL in place of a local path (CSV, JSON, Excel, etc.).
# The real call is left commented out so the notebook makes no network calls:
#
#     url = 'https://example.com/data.csv'
#     df = pd.read_csv(url)

print(
    "You can read directly from URLs:",
    "df = pd.read_csv('https://example.com/data.csv')",
    sep="\n",
)

---

## 6. Handling Large Files

In [None]:
# Synthesise a 10,000-row table; the fixed seed makes the random columns reproducible
np.random.seed(42)
n_rows = 10000
row_ids = range(n_rows)
# Draw order matters for reproducibility: values first, then categories
values = np.random.randn(n_rows)
categories = np.random.choice(['A', 'B', 'C'], n_rows)

large_df = pd.DataFrame({'id': row_ids, 'value': values, 'category': categories})
large_df.to_csv('large_file.csv', index=False)
print(f"Created large_file.csv with {len(large_df)} rows")

In [None]:
# Stream the file in fixed-size pieces so only one piece is parsed at a time
chunk_size = 2000
reader = pd.read_csv('large_file.csv', chunksize=chunk_size)

# Per-chunk processing step (here: keep rows with a positive value)
chunks = [part[part['value'] > 0] for part in reader]

# Stitch the filtered pieces back together with a fresh 0..n-1 index
result = pd.concat(chunks, ignore_index=True)
print(f"Processed {len(result)} rows with positive values")

---

## 7. Quick Reference: Common Parameters

### read_csv() Parameters:

| Parameter | Description | Example |
|-----------|-------------|--------|
| `sep` | Delimiter | `sep=';'` |
| `header` | Row number for headers | `header=0` |
| `names` | Custom column names | `names=['a','b']` |
| `index_col` | Column to use as index | `index_col='id'` |
| `usecols` | Columns to read | `usecols=['a','b']` |
| `nrows` | Number of rows to read | `nrows=100` |
| `skiprows` | Rows to skip | `skiprows=5` |
| `na_values` | Values to treat as NaN | `na_values=['NA']` |
| `parse_dates` | Parse date columns | `parse_dates=['date']` |
| `dtype` | Column data types | `dtype={'id': int}` |
| `encoding` | File encoding | `encoding='utf-8'` |

### to_csv() Parameters:

| Parameter | Description | Example |
|-----------|-------------|--------|
| `index` | Write index | `index=False` |
| `sep` | Delimiter | `sep=';'` |
| `columns` | Columns to write | `columns=['a','b']` |
| `header` | Write header | `header=False` |
| `mode` | Write mode | `mode='a'` (append) |

---

## 📝 Practice Problems

### Problem 1: Create and Read CSV
1. Create a DataFrame with student data: id, name, math_score, science_score
2. Save it to 'student_scores.csv' without the index
3. Read it back and display first 3 rows

In [None]:
# Your solution here

### Problem 2: Read with Custom Parameters
Create a CSV file 'custom_data.csv' with:
- Semicolon separator
- A comment line at the top
- Some 'NA' values

Then read it with appropriate parameters.

In [None]:
# Your solution here

### Problem 3: Selective Reading
Using 'employees.csv':
1. Read only the 'name' and 'department' columns
2. Read only the first 2 rows
3. Read with 'id' as the index

In [None]:
# Your solution here

---

## ✅ Solutions

### Solution 1: Create and Read CSV

In [None]:
# Solution 1
# Build the student table
score_columns = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'math_score': [90, 85, 78, 92, 88],
    'science_score': [88, 90, 82, 95, 85],
}
students = pd.DataFrame(score_columns)

# Write it out without the index column
scores_path = 'student_scores.csv'
students.to_csv(scores_path, index=False)
print("Saved student_scores.csv")

# Load it again and show the top three rows
df = pd.read_csv(scores_path)
print("\nFirst 3 rows:")
print(df.head(3))

### Solution 2: Read with Custom Parameters

In [None]:
# Solution 2
# Create custom CSV: '#' comment header, ';' separator, and two 'NA' cells.
# UTF-8 is given explicitly because read_csv decodes as UTF-8 by default.
with open('custom_data.csv', 'w', encoding='utf-8') as f:
    f.write("# Custom data file\n")
    f.write("product;price;stock\n")
    f.write("Apple;1.50;100\n")
    f.write("Banana;NA;150\n")
    f.write("Orange;1.25;NA\n")

print("File content:")
# Echo the raw file through a context manager so the handle is closed
# deterministically instead of being left for the garbage collector.
with open('custom_data.csv', encoding='utf-8') as f:
    print(f.read())

# Read with parameters
df = pd.read_csv(
    'custom_data.csv',
    sep=';',
    comment='#',
    na_values=['NA']
)
print("DataFrame:")
print(df)
print("\nData types:")
print(df.dtypes)

### Solution 3: Selective Reading

In [None]:
# Solution 3
# All three reads use the same file; only the read_csv option changes.
source = 'employees.csv'

# 1. Restrict to the name and department columns
selected = ['name', 'department']
df1 = pd.read_csv(source, usecols=selected)
print("1. Only name and department:")
print(df1)

# 2. Stop after the first two data rows
df2 = pd.read_csv(source, nrows=2)
print("\n2. First 2 rows:")
print(df2)

# 3. Use the id column as the row index
df3 = pd.read_csv(source, index_col='id')
print("\n3. With id as index:")
print(df3)

---

## 🧹 Cleanup

In [None]:
# Clean up created files
files_to_remove = [
    'employees.csv', 'products.csv', 'products_with_index.csv',
    'products_semicolon.csv', 'complex_data.csv', 'students.xlsx',
    'multi_sheet.xlsx', 'people.json', 'people_output.json',
    'large_file.csv', 'student_scores.csv', 'custom_data.csv'
]

for file in files_to_remove:
    # Attempt the removal directly rather than exists()-then-remove(), which
    # can fail if the file disappears between the two calls. Files that were
    # never created (e.g. the .xlsx ones without openpyxl) are skipped silently.
    try:
        os.remove(file)
    except FileNotFoundError:
        continue
    print(f"Removed: {file}")

---

## 📌 Summary

| Operation | Function | Key Parameters |
|-----------|----------|----------------|
| **Read CSV** | `pd.read_csv()` | `sep`, `usecols`, `nrows`, `index_col` |
| **Write CSV** | `df.to_csv()` | `index=False`, `sep` |
| **Read Excel** | `pd.read_excel()` | `sheet_name` |
| **Write Excel** | `df.to_excel()` | `sheet_name`, `index` |
| **Read JSON** | `pd.read_json()` | `orient` |
| **Write JSON** | `df.to_json()` | `orient`, `indent` |

**Next:** [27_pandas_exploration.ipynb](27_pandas_exploration.ipynb) - Data exploration and basic analysis