# DataFrame Indexes: Setting, Resetting, and Sorting

## ## 1. Basic Index Operations

In [1]:
import pandas as pd
import numpy as np

# Create sample dataset
data = {
    'employee_id': [101, 102, 103, 104, 105, 106, 107, 108],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Henry'],
    'department': ['IT', 'HR', 'Finance', 'IT', 'Marketing', 'Finance', 'HR', 'IT'],
    'salary': [75000, 85000, 65000, 78000, 60000, 90000, 62000, 82000],
    'experience': [3, 7, 2, 4, 1, 8, 2, 5],
    'hire_date': pd.date_range('2020-01-01', periods=8, freq='M')
}

df = pd.DataFrame(data)
print("Original DataFrame (default index 0-7):")
print(df)
print(f"\nCurrent index: {df.index}")
print(f"Index name: {df.index.name}")
print(f"Index type: {type(df.index)}")
print("\n" + "="*80)

Original DataFrame (default index 0-7):
   employee_id     name department  salary  experience  hire_date
0          101    Alice         IT   75000           3 2020-01-31
1          102      Bob         HR   85000           7 2020-02-29
2          103  Charlie    Finance   65000           2 2020-03-31
3          104    David         IT   78000           4 2020-04-30
4          105      Eva  Marketing   60000           1 2020-05-31
5          106    Frank    Finance   90000           8 2020-06-30
6          107    Grace         HR   62000           2 2020-07-31
7          108    Henry         IT   82000           5 2020-08-31

Current index: RangeIndex(start=0, stop=8, step=1)
Index name: None
Index type: <class 'pandas.core.indexes.range.RangeIndex'>



  'hire_date': pd.date_range('2020-01-01', periods=8, freq='M')


# 2. Setting Indexes
## Example 2.1: Set Single Column as Index

In [2]:
# Set employee_id as index
df_id_index = df.set_index('employee_id')
print("1. DataFrame with employee_id as index:")
print(df_id_index)
print(f"\nIndex: {df_id_index.index}")
print(f"Index name: {df_id_index.index.name}")

# Access data using index
print("\nAccessing data using index:")
print(f"Employee 104: {df_id_index.loc[104, 'name']}")
print(f"Salary of employee 106: ${df_id_index.loc[106, 'salary']:,}")

# Set index with inplace=True (modifies original)
df_copy = df.copy()
df_copy.set_index('employee_id', inplace=True)
print("\n2. Using inplace=True:")
print(df_copy.head())

1. DataFrame with employee_id as index:
                name department  salary  experience  hire_date
employee_id                                                   
101            Alice         IT   75000           3 2020-01-31
102              Bob         HR   85000           7 2020-02-29
103          Charlie    Finance   65000           2 2020-03-31
104            David         IT   78000           4 2020-04-30
105              Eva  Marketing   60000           1 2020-05-31
106            Frank    Finance   90000           8 2020-06-30
107            Grace         HR   62000           2 2020-07-31
108            Henry         IT   82000           5 2020-08-31

Index: Index([101, 102, 103, 104, 105, 106, 107, 108], dtype='int64', name='employee_id')
Index name: employee_id

Accessing data using index:
Employee 104: David
Salary of employee 106: $90,000

2. Using inplace=True:
                name department  salary  experience  hire_date
employee_id                                    

# Example 2.2: Set Multiple Columns as Index (MultiIndex)

In [3]:
# Set department and name as MultiIndex
df_multi_index = df.set_index(['department', 'name'])
print("DataFrame with MultiIndex (department, name):")
print(df_multi_index)
print(f"\nIndex: {df_multi_index.index}")
print(f"Index levels: {df_multi_index.index.nlevels}")
print(f"Index names: {df_multi_index.index.names}")

# Access data with MultiIndex
print("\nAccessing data with MultiIndex:")
print("IT department, Alice:")
print(df_multi_index.loc[('IT', 'Alice')])

print("\nAll employees in Finance department:")
print(df_multi_index.loc['Finance'])

DataFrame with MultiIndex (department, name):
                    employee_id  salary  experience  hire_date
department name                                               
IT         Alice            101   75000           3 2020-01-31
HR         Bob              102   85000           7 2020-02-29
Finance    Charlie          103   65000           2 2020-03-31
IT         David            104   78000           4 2020-04-30
Marketing  Eva              105   60000           1 2020-05-31
Finance    Frank            106   90000           8 2020-06-30
HR         Grace            107   62000           2 2020-07-31
IT         Henry            108   82000           5 2020-08-31

Index: MultiIndex([(       'IT',   'Alice'),
            (       'HR',     'Bob'),
            (  'Finance', 'Charlie'),
            (       'IT',   'David'),
            ('Marketing',     'Eva'),
            (  'Finance',   'Frank'),
            (       'HR',   'Grace'),
            (       'IT',   'Henry')],
           

# Example 2.3: Set Index with Non-Unique Values

In [4]:
# Create data with duplicate departments
df_duplicates = df.copy()
df_duplicates.loc[8] = [109, 'Ivy', 'IT', 72000, 3, pd.Timestamp('2021-06-01')]

# Set department as index (non-unique)
df_dept_index = df_duplicates.set_index('department')
print("DataFrame with department as index (non-unique values):")
print(df_dept_index)
print(f"\nNote: Index has duplicate 'IT' values")

# Access all rows with 'IT' index
print("\nAll IT department employees:")
print(df_dept_index.loc['IT'])

DataFrame with department as index (non-unique values):
            employee_id     name  salary  experience  hire_date
department                                                     
IT                  101    Alice   75000           3 2020-01-31
HR                  102      Bob   85000           7 2020-02-29
Finance             103  Charlie   65000           2 2020-03-31
IT                  104    David   78000           4 2020-04-30
Marketing           105      Eva   60000           1 2020-05-31
Finance             106    Frank   90000           8 2020-06-30
HR                  107    Grace   62000           2 2020-07-31
IT                  108    Henry   82000           5 2020-08-31
IT                  109      Ivy   72000           3 2021-06-01

Note: Index has duplicate 'IT' values

All IT department employees:
            employee_id   name  salary  experience  hire_date
department                                                   
IT                  101  Alice   75000         

# 3. Resetting Indexes
## Example 3.1: Basic reset_index()

In [5]:
# Start with employee_id as index
df_indexed = df.set_index('employee_id')
print("DataFrame with employee_id index:")
print(df_indexed)

# Reset index (moves index back to column)
df_reset = df_indexed.reset_index()
print("\n1. After reset_index():")
print(df_reset)
print(f"Index reset to: {df_reset.index}")

# Reset with drop=True (discard index)
df_reset_drop = df_indexed.reset_index(drop=True)
print("\n2. After reset_index(drop=True):")
print(df_reset_drop)
print(f"Index column removed, new default index: {df_reset_drop.index}")

DataFrame with employee_id index:
                name department  salary  experience  hire_date
employee_id                                                   
101            Alice         IT   75000           3 2020-01-31
102              Bob         HR   85000           7 2020-02-29
103          Charlie    Finance   65000           2 2020-03-31
104            David         IT   78000           4 2020-04-30
105              Eva  Marketing   60000           1 2020-05-31
106            Frank    Finance   90000           8 2020-06-30
107            Grace         HR   62000           2 2020-07-31
108            Henry         IT   82000           5 2020-08-31

1. After reset_index():
   employee_id     name department  salary  experience  hire_date
0          101    Alice         IT   75000           3 2020-01-31
1          102      Bob         HR   85000           7 2020-02-29
2          103  Charlie    Finance   65000           2 2020-03-31
3          104    David         IT   78000     

# Example 3.2: Reset MultiIndex

In [6]:
# Start with MultiIndex
df_multi = df.set_index(['department', 'name'])
print("MultiIndex DataFrame:")
print(df_multi)

# Reset all levels
df_reset_all = df_multi.reset_index()
print("\n1. Reset all levels:")
print(df_reset_all)

# Reset specific level only
df_reset_level = df_multi.reset_index(level='name')
print("\n2. Reset only 'name' level:")
print(df_reset_level)
print(f"Index after reset: {df_reset_level.index}")

# Reset with col_level and col_fill for MultiIndex columns
df_multi_cols = df_multi.copy()
df_multi_cols.columns = pd.MultiIndex.from_tuples([('data', col) for col in df_multi_cols.columns])
df_reset_multi = df_multi_cols.reset_index()
print("\n3. Reset with MultiIndex columns:")
print(df_reset_multi)

MultiIndex DataFrame:
                    employee_id  salary  experience  hire_date
department name                                               
IT         Alice            101   75000           3 2020-01-31
HR         Bob              102   85000           7 2020-02-29
Finance    Charlie          103   65000           2 2020-03-31
IT         David            104   78000           4 2020-04-30
Marketing  Eva              105   60000           1 2020-05-31
Finance    Frank            106   90000           8 2020-06-30
HR         Grace            107   62000           2 2020-07-31
IT         Henry            108   82000           5 2020-08-31

1. Reset all levels:
  department     name  employee_id  salary  experience  hire_date
0         IT    Alice          101   75000           3 2020-01-31
1         HR      Bob          102   85000           7 2020-02-29
2    Finance  Charlie          103   65000           2 2020-03-31
3         IT    David          104   78000           4 2020-04

# Example 3.3: Reset Index with inplace

In [7]:
df_temp = df.set_index('employee_id').copy()
print("Before reset (inplace):")
print(df_temp.head())

df_temp.reset_index(inplace=True)
print("\nAfter reset_index(inplace=True):")
print(df_temp.head())

Before reset (inplace):
                name department  salary  experience  hire_date
employee_id                                                   
101            Alice         IT   75000           3 2020-01-31
102              Bob         HR   85000           7 2020-02-29
103          Charlie    Finance   65000           2 2020-03-31
104            David         IT   78000           4 2020-04-30
105              Eva  Marketing   60000           1 2020-05-31

After reset_index(inplace=True):
   employee_id     name department  salary  experience  hire_date
0          101    Alice         IT   75000           3 2020-01-31
1          102      Bob         HR   85000           7 2020-02-29
2          103  Charlie    Finance   65000           2 2020-03-31
3          104    David         IT   78000           4 2020-04-30
4          105      Eva  Marketing   60000           1 2020-05-31


# 4. Sorting Indexes
## Example 4.1: sort_index() Basics

In [8]:
# Create unsorted index
df_unsorted = df.set_index('name').sample(frac=1, random_state=42)
print("DataFrame with unsorted name index:")
print(df_unsorted)

# Sort index alphabetically
df_sorted_asc = df_unsorted.sort_index()
print("\n1. Sorted index (ascending A-Z):")
print(df_sorted_asc)

# Sort index descending
df_sorted_desc = df_unsorted.sort_index(ascending=False)
print("\n2. Sorted index (descending Z-A):")
print(df_sorted_desc)

# Sort with null handling
df_with_nulls = df_unsorted.copy()
df_with_nulls.loc['Unknown'] = [999, 'Unknown', 0, 0, pd.NaT]
df_sorted_nulls = df_with_nulls.sort_index(na_position='first')
print("\n3. Sorted with nulls first:")
print(df_sorted_nulls)

DataFrame with unsorted name index:
         employee_id department  salary  experience  hire_date
name                                                          
Bob              102         HR   85000           7 2020-02-29
Frank            106    Finance   90000           8 2020-06-30
Alice            101         IT   75000           3 2020-01-31
Henry            108         IT   82000           5 2020-08-31
Charlie          103    Finance   65000           2 2020-03-31
Eva              105  Marketing   60000           1 2020-05-31
David            104         IT   78000           4 2020-04-30
Grace            107         HR   62000           2 2020-07-31

1. Sorted index (ascending A-Z):
         employee_id department  salary  experience  hire_date
name                                                          
Alice            101         IT   75000           3 2020-01-31
Bob              102         HR   85000           7 2020-02-29
Charlie          103    Finance   65000         

  df_with_nulls.loc['Unknown'] = [999, 'Unknown', 0, 0, pd.NaT]


# Example 4.2: Sorting MultiIndex

In [9]:
# Create MultiIndex with unsorted values
df_multi_unsorted = df.set_index(['department', 'name']).sample(frac=1, random_state=42)
print("MultiIndex DataFrame (unsorted):")
print(df_multi_unsorted)

# Sort by all levels
df_multi_sorted = df_multi_unsorted.sort_index()
print("\n1. Sorted by all levels (department then name):")
print(df_multi_sorted)

# Sort by specific level
df_sorted_level0 = df_multi_unsorted.sort_index(level=0)  # department
print("\n2. Sorted by level 0 only (department):")
print(df_sorted_level0)

# Sort levels in different directions
df_mixed_sort = df_multi_unsorted.sort_index(level=[0, 1], ascending=[True, False])
print("\n3. Department A-Z, Name Z-A:")
print(df_mixed_sort)

MultiIndex DataFrame (unsorted):
                    employee_id  salary  experience  hire_date
department name                                               
HR         Bob              102   85000           7 2020-02-29
Finance    Frank            106   90000           8 2020-06-30
IT         Alice            101   75000           3 2020-01-31
           Henry            108   82000           5 2020-08-31
Finance    Charlie          103   65000           2 2020-03-31
Marketing  Eva              105   60000           1 2020-05-31
IT         David            104   78000           4 2020-04-30
HR         Grace            107   62000           2 2020-07-31

1. Sorted by all levels (department then name):
                    employee_id  salary  experience  hire_date
department name                                               
Finance    Charlie          103   65000           2 2020-03-31
           Frank            106   90000           8 2020-06-30
HR         Bob              102   85

# Example 4.3: sort_index() with Custom Sorting

In [10]:
# Create custom department order
dept_order = ['Finance', 'IT', 'HR', 'Marketing']

# Convert to categorical with custom order
df_custom = df.copy()
df_custom['department'] = pd.Categorical(df_custom['department'], 
                                        categories=dept_order, 
                                        ordered=True)

# Set index and sort
df_custom_index = df_custom.set_index(['department', 'name']).sort_index()
print("Sorted with custom department order (Finance → IT → HR → Marketing):")
print(df_custom_index)

Sorted with custom department order (Finance → IT → HR → Marketing):
                    employee_id  salary  experience  hire_date
department name                                               
Finance    Charlie          103   65000           2 2020-03-31
           Frank            106   90000           8 2020-06-30
IT         Alice            101   75000           3 2020-01-31
           David            104   78000           4 2020-04-30
           Henry            108   82000           5 2020-08-31
HR         Bob              102   85000           7 2020-02-29
           Grace            107   62000           2 2020-07-31
Marketing  Eva              105   60000           1 2020-05-31


# 5. Index Operations in Practice
## Example 5.1: Time Series Index

In [11]:
# Create time series data
dates = pd.date_range('2024-01-01', periods=10, freq='D')
time_data = pd.DataFrame({
    'date': dates,
    'sales': np.random.randint(100, 1000, 10),
    'temperature': np.random.randint(20, 35, 10),
    'rainfall': np.random.rand(10) * 10
})

# Set date as index
time_series = time_data.set_index('date')
print("Time Series with date index:")
print(time_series)

# Sort time index
time_sorted = time_series.sort_index()
print("\nSorted by date:")
print(time_sorted)

# Date-based slicing
print("\nDate-based operations:")
print(f"January 3-5, 2024:")
print(time_sorted.loc['2024-01-03':'2024-01-05'])

print(f"\nAll of January 2024:")
print(time_sorted.loc['2024-01'])

Time Series with date index:
            sales  temperature  rainfall
date                                    
2024-01-01    681           23  0.144641
2024-01-02    506           28  5.688241
2024-01-03    512           34  5.498679
2024-01-04    838           24  5.593980
2024-01-05    932           34  8.828648
2024-01-06    104           27  7.944162
2024-01-07    372           24  8.612475
2024-01-08    816           27  2.064229
2024-01-09    988           33  9.280950
2024-01-10    336           29  3.953991

Sorted by date:
            sales  temperature  rainfall
date                                    
2024-01-01    681           23  0.144641
2024-01-02    506           28  5.688241
2024-01-03    512           34  5.498679
2024-01-04    838           24  5.593980
2024-01-05    932           34  8.828648
2024-01-06    104           27  7.944162
2024-01-07    372           24  8.612475
2024-01-08    816           27  2.064229
2024-01-09    988           33  9.280950
2024-01-10 

# Example 5.2: Hierarchical Data Analysis

In [12]:
# Create hierarchical sales data
sales_data = {
    'region': ['North', 'North', 'North', 'South', 'South', 'South', 'East', 'East'],
    'city': ['NYC', 'Boston', 'NYC', 'Atlanta', 'Miami', 'Atlanta', 'Chicago', 'Detroit'],
    'product': ['A', 'B', 'A', 'B', 'A', 'A', 'B', 'A'],
    'sales': [1000, 1500, 1200, 800, 900, 1100, 1300, 700],
    'profit': [200, 300, 240, 160, 180, 220, 260, 140]
}

sales_df = pd.DataFrame(sales_data)

# Set hierarchical index
sales_indexed = sales_df.set_index(['region', 'city', 'product']).sort_index()
print("Sales data with hierarchical index (region → city → product):")
print(sales_indexed)

# Query at different levels
print("\nQuery examples:")
print("1. All sales in North region:")
print(sales_indexed.loc['North'])

print("\n2. All sales in NYC:")
print(sales_indexed.xs('NYC', level='city'))

print("\n3. Product A sales across all regions:")
print(sales_indexed.xs('A', level='product'))

Sales data with hierarchical index (region → city → product):
                        sales  profit
region city    product               
East   Chicago B         1300     260
       Detroit A          700     140
North  Boston  B         1500     300
       NYC     A         1000     200
               A         1200     240
South  Atlanta A         1100     220
               B          800     160
       Miami   A          900     180

Query examples:
1. All sales in North region:
                sales  profit
city   product               
Boston B         1500     300
NYC    A         1000     200
       A         1200     240

2. All sales in NYC:
                sales  profit
region product               
North  A         1000     200
       A         1200     240

3. Product A sales across all regions:
                sales  profit
region city                  
East   Detroit    700     140
North  NYC       1000     200
       NYC       1200     240
South  Atlanta   1100     220

# 6. Advanced Index Techniques
## Example 6.1: Index Alignment in Operations

In [13]:
# Create two DataFrames with different indexes
df1 = pd.DataFrame({'A': [1, 2, 3]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({'B': [4, 5, 6]}, index=['b', 'c', 'd'])

print("DataFrame 1:")
print(df1)
print("\nDataFrame 2:")
print(df2)

# Index alignment in addition
result = df1 + df2
print("\nAddition with index alignment:")
print(result)
print("Note: Only matching indexes (b, c) have results")

# Reindex to align indexes
df1_reindexed = df1.reindex(df2.index)
print("\nDataFrame 1 reindexed to match DataFrame 2:")
print(df1_reindexed)

DataFrame 1:
   A
a  1
b  2
c  3

DataFrame 2:
   B
b  4
c  5
d  6

Addition with index alignment:
    A   B
a NaN NaN
b NaN NaN
c NaN NaN
d NaN NaN
Note: Only matching indexes (b, c) have results

DataFrame 1 reindexed to match DataFrame 2:
     A
b  2.0
c  3.0
d  NaN


## Example 6.2: Index with Missing Values

In [14]:
# Create index with missing values
df_missing_idx = pd.DataFrame({
    'value': [10, 20, 30, 40]
}, index=['A', 'B', None, 'D'])

print("DataFrame with None in index:")
print(df_missing_idx)

# Sort with nulls
sorted_nulls_last = df_missing_idx.sort_index(na_position='last')
print("\nSorted with nulls last:")
print(sorted_nulls_last)

sorted_nulls_first = df_missing_idx.sort_index(na_position='first')
print("\nSorted with nulls first:")
print(sorted_nulls_first)

DataFrame with None in index:
      value
A        10
B        20
None     30
D        40

Sorted with nulls last:
      value
A        10
B        20
D        40
None     30

Sorted with nulls first:
      value
None     30
A        10
B        20
D        40


## Example 6.3: Renaming Index

In [15]:
# Create DataFrame with index
df_idx = df.set_index('employee_id')
print("Original indexed DataFrame:")
print(df_idx)

# Rename index
df_idx_renamed = df_idx.rename_axis('emp_id')
print("\nAfter rename_axis('emp_id'):")
print(df_idx_renamed)
print(f"Index name: {df_idx_renamed.index.name}")

# Rename index with mapping
df_idx_mapped = df_idx.rename(index={101: 'EMP_101', 102: 'EMP_102'})
print("\nAfter renaming specific index values:")
print(df_idx_mapped.head())

Original indexed DataFrame:
                name department  salary  experience  hire_date
employee_id                                                   
101            Alice         IT   75000           3 2020-01-31
102              Bob         HR   85000           7 2020-02-29
103          Charlie    Finance   65000           2 2020-03-31
104            David         IT   78000           4 2020-04-30
105              Eva  Marketing   60000           1 2020-05-31
106            Frank    Finance   90000           8 2020-06-30
107            Grace         HR   62000           2 2020-07-31
108            Henry         IT   82000           5 2020-08-31

After rename_axis('emp_id'):
           name department  salary  experience  hire_date
emp_id                                                   
101       Alice         IT   75000           3 2020-01-31
102         Bob         HR   85000           7 2020-02-29
103     Charlie    Finance   65000           2 2020-03-31
104       David       

## 7. Performance Considerations

In [16]:
# Create large dataset
np.random.seed(42)
large_df = pd.DataFrame({
    'id': range(1000000),
    'category': np.random.choice(['A', 'B', 'C', 'D', 'E'], 1000000),
    'value': np.random.randn(1000000)
})

print(f"Large DataFrame: {len(large_df):,} rows")

import time

# Performance test 1: Setting index vs not
start = time.time()
result1 = large_df[large_df['category'] == 'A']
time1 = time.time() - start
print(f"1. Filter without index: {time1:.4f} seconds")

start = time.time()
df_indexed = large_df.set_index('category')
result2 = df_indexed.loc['A']
time2 = time.time() - start
print(f"2. Filter with index (including set_index time): {time2:.4f} seconds")

# Performance test 2: Sorting
start = time.time()
sorted_by_value = large_df.sort_values('value')
time3 = time.time() - start
print(f"\n3. Sort by value column: {time3:.4f} seconds")

start = time.time()
sorted_by_index = df_indexed.sort_index()
time4 = time.time() - start
print(f"4. Sort by index: {time4:.4f} seconds")

print(f"\nIndex sorting is {time3/time4:.1f}x faster than column sorting")

Large DataFrame: 1,000,000 rows
1. Filter without index: 0.4077 seconds
2. Filter with index (including set_index time): 0.0584 seconds

3. Sort by value column: 0.2739 seconds
4. Sort by index: 0.5540 seconds

Index sorting is 0.5x faster than column sorting


# 8. Real-World Use Cases
## Example 8.1: Student Gradebook System

In [17]:
# Student gradebook
grades_data = {
    'student_id': ['S001', 'S002', 'S003', 'S004', 'S005'],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Evan'],
    'math': [85, 92, 78, 88, 95],
    'science': [90, 88, 82, 91, 94],
    'english': [88, 85, 90, 87, 92]
}

grades = pd.DataFrame(grades_data)
print("Original gradebook:")
print(grades)

# Set student_id as index for quick lookups
grades_indexed = grades.set_index('student_id')
print("\nGradebook with student_id index:")
print(grades_indexed)

# Calculate averages
grades_indexed['average'] = grades_indexed[['math', 'science', 'english']].mean(axis=1)
grades_indexed['grade'] = pd.cut(grades_indexed['average'], 
                                bins=[0, 60, 70, 80, 90, 100],
                                labels=['F', 'D', 'C', 'B', 'A'])

# Sort by average grade
grades_sorted = grades_indexed.sort_values('average', ascending=False)
print("\nGradebook sorted by average (highest first):")
print(grades_sorted[['name', 'average', 'grade']])

# Reset index for reporting
report = grades_sorted.reset_index()
print("\nFinal report (reset index):")
print(report[['student_id', 'name', 'average', 'grade']])

Original gradebook:
  student_id     name  math  science  english
0       S001    Alice    85       90       88
1       S002      Bob    92       88       85
2       S003  Charlie    78       82       90
3       S004    Diana    88       91       87
4       S005     Evan    95       94       92

Gradebook with student_id index:
               name  math  science  english
student_id                                 
S001          Alice    85       90       88
S002            Bob    92       88       85
S003        Charlie    78       82       90
S004          Diana    88       91       87
S005           Evan    95       94       92

Gradebook sorted by average (highest first):
               name    average grade
student_id                          
S005           Evan  93.666667     A
S004          Diana  88.666667     B
S002            Bob  88.333333     B
S001          Alice  87.666667     B
S003        Charlie  83.333333     B

Final report (reset index):
  student_id     name    ave

## Example 8.2: Stock Price Analysis

In [18]:
# Stock price data
dates = pd.date_range('2024-01-01', '2024-01-10', freq='B')
stocks = pd.DataFrame({
    'date': np.repeat(dates, 3),
    'ticker': ['AAPL', 'GOOGL', 'MSFT'] * len(dates),
    'price': np.random.uniform(100, 500, len(dates) * 3),
    'volume': np.random.randint(1000, 10000, len(dates) * 3)
})

print("Stock data (first 10 rows):")
print(stocks.head(10))

# Set hierarchical index (date, ticker)
stocks_indexed = stocks.set_index(['date', 'ticker']).sort_index()
print("\nStock data with hierarchical index:")
print(stocks_indexed.head(10))

# Calculate daily returns
stocks_indexed['prev_price'] = stocks_indexed.groupby('ticker')['price'].shift(1)
stocks_indexed['daily_return'] = (stocks_indexed['price'] - stocks_indexed['prev_price']) / stocks_indexed['prev_price']

# Reset index for pivot table
stocks_reset = stocks_indexed.reset_index()
pivot = stocks_reset.pivot_table(
    index='date',
    columns='ticker',
    values='daily_return'
)

print("\nDaily returns pivot table:")
print(pivot)

Stock data (first 10 rows):
        date ticker       price  volume
0 2024-01-01   AAPL  241.763487    2106
1 2024-01-01  GOOGL  443.449582    6929
2 2024-01-01   MSFT  453.706054    5285
3 2024-01-02   AAPL  105.428238    8753
4 2024-01-02  GOOGL  310.082503    1599
5 2024-01-02   MSFT  309.151980    2577
6 2024-01-03   AAPL  362.063414    5934
7 2024-01-03  GOOGL  335.725493    9464
8 2024-01-03   MSFT  406.084851    7485
9 2024-01-04   AAPL  368.965148    8363

Stock data with hierarchical index:
                        price  volume
date       ticker                    
2024-01-01 AAPL    241.763487    2106
           GOOGL   443.449582    6929
           MSFT    453.706054    5285
2024-01-02 AAPL    105.428238    8753
           GOOGL   310.082503    1599
           MSFT    309.151980    2577
2024-01-03 AAPL    362.063414    5934
           GOOGL   335.725493    9464
           MSFT    406.084851    7485
2024-01-04 AAPL    368.965148    8363

Daily returns pivot table:
ticker     

# 9. Common Patterns and Best Practices

In [19]:
print("="*80)
print("INDEX OPERATIONS - BEST PRACTICES")
print("="*80)

patterns = [
    ("Set meaningful indexes", 
     "✓ Use set_index() for columns used in frequent lookups"),
    
    ("Sort after setting index", 
     "✓ Always sort_index() after set_index() for performance"),
    
    ("Reset for operations", 
     "✓ Use reset_index() before operations that need default index"),
    
    ("Keep original data", 
     "✓ Use copy() before inplace operations: df_copy = df.copy()"),
    
    ("Handle duplicates", 
     "✓ Check for duplicates: df.index.is_unique"),
    
    ("MultiIndex for hierarchies", 
     "✓ Use set_index([col1, col2]) for hierarchical data"),
    
    ("Time series", 
     "✓ Always set date as index for time series data"),
    
    ("Export preparation", 
     "✓ Reset index before exporting to CSV/Excel"),
]

for pattern, practice in patterns:
    print(f"\n{pattern}:")
    print(f"  {practice}")

print("\n" + "="*80)
print("COMMON MISTAKES TO AVOID")
print("="*80)

mistakes = [
    "✗ Forgetting to sort after setting index",
    "✗ Using non-unique values as index without understanding implications",
    "✗ Not resetting index before certain operations (like merging)",
    "✗ Using inplace=True without making a copy first",
    "✗ Setting index on temporary DataFrames",
    "✗ Ignoring index when performing arithmetic operations",
]

for mistake in mistakes:
    print(f"- {mistake}")

INDEX OPERATIONS - BEST PRACTICES

Set meaningful indexes:
  ✓ Use set_index() for columns used in frequent lookups

Sort after setting index:
  ✓ Always sort_index() after set_index() for performance

Reset for operations:
  ✓ Use reset_index() before operations that need default index

Keep original data:
  ✓ Use copy() before inplace operations: df_copy = df.copy()

Handle duplicates:
  ✓ Check for duplicates: df.index.is_unique

MultiIndex for hierarchies:
  ✓ Use set_index([col1, col2]) for hierarchical data

Time series:
  ✓ Always set date as index for time series data

Export preparation:
  ✓ Reset index before exporting to CSV/Excel

COMMON MISTAKES TO AVOID
- ✗ Forgetting to sort after setting index
- ✗ Using non-unique values as index without understanding implications
- ✗ Not resetting index before certain operations (like merging)
- ✗ Using inplace=True without making a copy first
- ✗ Setting index on temporary DataFrames
- ✗ Ignoring index when performing arithmetic opera