<a href="https://colab.research.google.com/github/yellowgram1543/6-Stages-of-AIML/blob/main/AIML0_Day4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**What are Index Objects?**

Index objects in pandas are immutable array-like structures that provide the **axis labels** for pandas Series and DataFrames. They serve as the backbone for data alignment, selection, and organization in pandas.

In [5]:
import pandas as pd
import numpy as np

# Create a basic Index
idx1 = pd.Index(['apple', 'banana', 'cherry'])
print(idx1)
# Basic string index

# Create Index from a list
idx2 = pd.Index([1, 2, 3, 4, 5])
# Numeric index from list

# Create Index with custom name
idx3 = pd.Index(['A', 'B', 'C'], name='letters')
# Named index for better identification

# Create Index from NumPy array
arr = np.array(['x', 'y', 'z'])
idx4 = pd.Index(arr)
# Index created from NumPy array

# RangeIndex (memory efficient for sequential integers)
idx5 = pd.RangeIndex(start=0, stop=10, step=1)
# Efficient index for sequential integers

# Create Index with specific dtype
idx6 = pd.Index([1.1, 2.2, 3.3], dtype='float64')
# Index with explicit data type

print("\n")

Index(['apple', 'banana', 'cherry'], dtype='object')




### Set Operations

In [None]:
idx1 = pd.Index(['a', 'b', 'c', 'd'])
idx2 = pd.Index(['c', 'd', 'e', 'f'])

# Union of two indexes
union_idx = idx1.union(idx2)
print(f"Union: {union_idx}\n")
# Combine all unique elements from both indexes

# Intersection of two indexes
intersection_idx = idx1.intersection(idx2)
print(f"Intersection: {intersection_idx}\n")
# Get common elements between indexes

# Difference between indexes
difference_idx = idx1.difference(idx2)
print(f"Difference (idx1 - idx2): {difference_idx}\n")
# Get elements in idx1 but not in idx2

# Symmetric difference
sym_diff = idx1.symmetric_difference(idx2)
print(f"Symmetric Difference: {sym_diff}\n")
# Get elements in either index but not both

Union: Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

Intersection: Index(['c', 'd'], dtype='object')

Difference (idx1 - idx2): Index(['a', 'b'], dtype='object')

Symmetric Difference: Index(['a', 'b', 'e', 'f'], dtype='object')



### **Membership and Comparison**

In [None]:
idx = pd.Index(['apple', 'banana', 'cherry'])

# Check if element exists in index
has_banana = 'banana' in idx
print(f"'banana' in idx: {has_banana}\n")


# Check if all elements of another index are in this index
other_idx = pd.Index(['apple', 'cherry'])
is_subset = other_idx.isin(idx).all()
print(f"other_idx is subset of idx: {is_subset}\n")


# Get positions of elements
positions = idx.get_indexer(['banana', 'apple'])
print(f"Positions of ['banana', 'apple']: {positions}\n")


# Check if index contains any of the specified elements
contains_any = idx.isin(['banana', 'grape']).any()
print(f"idx contains any of ['banana', 'grape']: {contains_any}\n")

'banana' in idx: True

other_idx is subset of idx: True

Positions of ['banana', 'apple']: [1 0]

idx contains any of ['banana', 'grape']: True



### Specialized Index Types

**RangeIndex**

In [None]:
# Create RangeIndex
range_idx = pd.RangeIndex(0, 10, 2)
print(f"RangeIndex: {range_idx}\n")

# Convert to regular Index
regular_idx = pd.Index(range_idx.to_list())
print(f"Converted to regular Index: {regular_idx}\n")

# Check if it's a RangeIndex
is_range = isinstance(range_idx, pd.RangeIndex)
print(f"Is RangeIndex: {is_range}\n")

RangeIndex: RangeIndex(start=0, stop=10, step=2)

Converted to regular Index: Index([0, 2, 4, 6, 8], dtype='int64')

Is RangeIndex: True



In [None]:
# Create MultiIndex from arrays
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
multi_idx = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
print(f"MultiIndex from arrays: {multi_idx}\n")

# Create MultiIndex from product
levels = [['A', 'B'], [1, 2, 3]]
multi_idx2 = pd.MultiIndex.from_product(levels, names=['letter', 'number'])
print(f"MultiIndex from product: {multi_idx2}\n")

# Access levels
level_0 = multi_idx.get_level_values(0)
print(f"Level 0 values: {level_0}\n")

MultiIndex from arrays: MultiIndex([('A', 1),
            ('A', 2),
            ('B', 1),
            ('B', 2)],
           names=['first', 'second'])

MultiIndex from product: MultiIndex([('A', 1),
            ('A', 2),
            ('A', 3),
            ('B', 1),
            ('B', 2),
            ('B', 3)],
           names=['letter', 'number'])

Level 0 values: Index(['A', 'A', 'B', 'B'], dtype='object', name='first')



**Reindexing and Alignment**

| Argument    | Description |
|-------------|-------------|
| **labels**  | New sequence to use as an index. Can be any sequence-like object. If an Index is provided, it is used as-is without copying. |
| **index**   | New index (row labels) to use. Equivalent to `reindex(index=...)`. |
| **columns** | New column labels to use. Equivalent to `reindex(columns=...)`. |
| **axis**    | Axis to reindex: `"index"` (rows, default) or `"columns"`. |
| **method**  | Fill strategy for missing labels: `"ffill"` (forward fill) or `"bfill"` (backward fill). |
| **fill_value** | Value to insert for missing labels created during reindexing; defaults to `NaN` if not provided. |
| **limit**   | When using forward/backward fill, the maximum number of items to fill. |
| **tolerance** | When forward/backfilling with inexact matches, sets the max numeric distance allowed. |
| **level**   | For MultiIndex: match on a specific index level or select subset. |
| **copy**    | If `True`, always copy data even if index is unchanged; if `False`, avoid copying when possible. |


In [None]:
original_idx = pd.Index(['a', 'b', 'c', 'd'])
new_idx = pd.Index(['b', 'd', 'e', 'a'])

# Reindex operation: Align index to new target index
reindexed = original_idx.reindex(new_idx)
print(f"Reindexed Index: {reindexed}\n")

# Get indexer for reindexing: Get positions for reindexing operation
indexer = original_idx.get_indexer(new_idx)
print(f"Indexer for reindexing: {indexer}\n")

Reindexed Index: (Index(['b', 'd', 'e', 'a'], dtype='object'), array([ 1,  3, -1,  0]))

Indexer for reindexing: [ 1  3 -1  0]



### **Filtering in Pandas**

In [None]:
import pandas as pd
import numpy as np

# Create sample DataFrame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 22],
    'salary': [50000, 60000, 70000, 55000, 45000],
    'department': ['IT', 'HR', 'IT', 'Finance', 'IT']
})
print(df)
print("\n")

# Filter rows where age is greater than 25
filtered_df = df[df['age'] > 25]
# Select rows where age column values exceed 25
print("rows where age column values exceed 25 \n", filtered_df)
print("\n")

# Filter rows where department is 'IT'
it_employees = df[df['department'] == 'IT']
# Select all employees working in IT department
print("all employees working in IT department \n", it_employees)
print("\n")

# Filter using multiple conditions with & (and)
senior_it = df[(df['age'] > 25) & (df['department'] == 'IT')]
# Select IT employees older than 25 using bitwise AND
print("IT employees older than 25 \n", senior_it)
print("\n")

# Filter using multiple conditions with | (or)
young_or_high_paid = df[(df['age'] < 25) | (df['salary'] > 60000)]
# Select employees who are either young (<25) or highly paid (>60k)
print("employees who are either young (<25) or highly paid (>60k) \n", young_or_high_paid)
print("\n")

      name  age  salary department
0    Alice   25   50000         IT
1      Bob   30   60000         HR
2  Charlie   35   70000         IT
3    Diana   28   55000    Finance
4      Eve   22   45000         IT


rows where age column values exceed 25 
       name  age  salary department
1      Bob   30   60000         HR
2  Charlie   35   70000         IT
3    Diana   28   55000    Finance


all employees working in IT department 
       name  age  salary department
0    Alice   25   50000         IT
2  Charlie   35   70000         IT
4      Eve   22   45000         IT


IT employees older than 25 
       name  age  salary department
2  Charlie   35   70000         IT


employees who are either young (<25) or highly paid (>60k) 
       name  age  salary department
2  Charlie   35   70000         IT
4      Eve   22   45000         IT




**Using isin() Method**

In [None]:
# Filter rows where department is in a list of values
specific_deps = df[df['department'].isin(['IT', 'HR'])]
# Select employees from IT or HR departments using isin()
print(specific_deps)
print("\n")

# Filter rows where name is not in a list
exclude_names = df[~df['name'].isin(['Alice', 'Bob'])]
# Exclude specific names using negation operator with isin()
print(exclude_names)
print("\n")

      name  age  salary department
0    Alice   25   50000         IT
1      Bob   30   60000         HR
2  Charlie   35   70000         IT
4      Eve   22   45000         IT


      name  age  salary department
2  Charlie   35   70000         IT
3    Diana   28   55000    Finance
4      Eve   22   45000         IT




**String Filtering**

In [None]:
# Create DataFrame with string data
df_strings = pd.DataFrame({
    'product': ['Laptop Pro', 'Phone Basic', 'Tablet Ultra', 'Watch Sport', 'Headphones Pro'],
    'category': ['Electronics', 'Mobile', 'Electronics', 'Wearables', 'Audio']
})
print(df_strings)
print("\n")

# Filter rows where product contains 'Pro'
pro_products = df_strings[df_strings['product'].str.contains('Pro')]
# Select products with 'Pro' in their name using string contains
print(pro_products)
print("\n")

# Filter rows where category starts with 'E'
electronics = df_strings[df_strings['category'].str.startswith('E')]
# Select categories starting with letter 'E'
print(electronics)
print("\n")

# Filter with case-insensitive string matching
case_insensitive = df_strings[df_strings['product'].str.contains('pro', case=False)]
# Case-insensitive search for 'pro' in product names
print(case_insensitive)
print("\n")

          product     category
0      Laptop Pro  Electronics
1     Phone Basic       Mobile
2    Tablet Ultra  Electronics
3     Watch Sport    Wearables
4  Headphones Pro        Audio


          product     category
0      Laptop Pro  Electronics
4  Headphones Pro        Audio


        product     category
0    Laptop Pro  Electronics
2  Tablet Ultra  Electronics


          product     category
0      Laptop Pro  Electronics
4  Headphones Pro        Audio




**Using query() Method**

In [None]:
print(df)
print("\n")

# Filter using query method with column names as variables
high_earners = df.query('salary > 55000')
# Select employees earning more than 55k using query syntax
print(high_earners)
print("\n")

# Filter with multiple conditions in query
complex_query = df.query('age > 25 and department == "IT"')
# Complex filtering using query with multiple conditions
print(complex_query)
print("\n")

# Use variables in query with @ symbol
min_age = 27
age_filtered = df.query('age > @min_age')
# Use external variable in query with @ prefix
print(age_filtered)
print("\n")

      name  age  salary department
0    Alice   25   50000         IT
1      Bob   30   60000         HR
2  Charlie   35   70000         IT
3    Diana   28   55000    Finance
4      Eve   22   45000         IT


      name  age  salary department
1      Bob   30   60000         HR
2  Charlie   35   70000         IT


      name  age  salary department
2  Charlie   35   70000         IT


      name  age  salary department
1      Bob   30   60000         HR
2  Charlie   35   70000         IT
3    Diana   28   55000    Finance




**Advanced Filtering Techniques**

In [None]:
# Create DataFrame for advanced examples
df_advanced = pd.DataFrame({
    'score': [85, 92, 78, 96, 88, 73, 91, 82],
    'subject': ['Math', 'Science', 'Math', 'Science', 'Math', 'Science', 'Math', 'Science'],
    'student_id': [1, 2, 3, 4, 5, 6, 7, 8]
})
print(df_advanced)
print("\n")

# Filter using between() method
good_scores = df_advanced[df_advanced['score'].between(80, 90)]
# Select scores between 80 and 90 inclusive
print(good_scores)
print("\n")

# Filter using quantile-based conditions
above_median = df_advanced[df_advanced['score'] > df_advanced['score'].quantile(0.5)]
# Select scores above the median value
print(above_median)
print("\n")

# Filter using custom function with apply
def is_excellent(row):
    return row['score'] > 90 and row['subject'] == 'Science'

excellent_science = df_advanced[df_advanced.apply(is_excellent, axis=1)]
# Apply custom function to filter rows based on multiple conditions
print(excellent_science)
print("\n")

   score  subject  student_id
0     85     Math           1
1     92  Science           2
2     78     Math           3
3     96  Science           4
4     88     Math           5
5     73  Science           6
6     91     Math           7
7     82  Science           8


   score  subject  student_id
0     85     Math           1
4     88     Math           5
7     82  Science           8


   score  subject  student_id
1     92  Science           2
3     96  Science           4
4     88     Math           5
6     91     Math           7


   score  subject  student_id
1     92  Science           2
3     96  Science           4




**Filtering Categorical Data**

In [None]:
# Create DataFrame with categorical data
df_cat = pd.DataFrame({
    'size': pd.Categorical(['Small', 'Medium', 'Large', 'Medium', 'Small'],
                          categories=['Small', 'Medium', 'Large'], ordered=True),
    'price': [10, 15, 20, 15, 10]
})
print(df_cat)
print("\n")

# Filter categorical data using category values
medium_items = df_cat[df_cat['size'] == 'Medium']
# Select items with Medium size category
print(medium_items)
print("\n")

# Filter using categorical ordering
large_or_bigger = df_cat[df_cat['size'] >= 'Medium']
# Use ordered categorical comparison to get Medium and Large items
print(large_or_bigger)
print("\n")

     size  price
0   Small     10
1  Medium     15
2   Large     20
3  Medium     15
4   Small     10


     size  price
1  Medium     15
3  Medium     15


     size  price
1  Medium     15
2   Large     20
3  Medium     15




**Filtering with Duplicate Handling**

In [None]:
# Create DataFrame with duplicates
df_dupes = pd.DataFrame({
    'id': [1, 2, 2, 3, 4, 4, 4],
    'value': [10, 20, 20, 30, 40, 40, 50]
})
print(df_dupes)
print("\n")

# Filter to keep only duplicate rows
only_duplicates = df_dupes[df_dupes.duplicated(keep=False)]
# Keep all rows that have duplicates (including first occurrence)
print(only_duplicates)
print("\n")

# Filter to keep only unique rows
only_unique = df_dupes[~df_dupes.duplicated()]
# Keep only the first occurrence of each duplicate group
print(only_unique)
print("\n")

# Filter based on duplicate count
value_counts = df_dupes['id'].value_counts()
frequent_ids = df_dupes[df_dupes['id'].isin(value_counts[value_counts > 1].index)]
# Select rows where id appears more than once
print(frequent_ids)
print("\n")

   id  value
0   1     10
1   2     20
2   2     20
3   3     30
4   4     40
5   4     40
6   4     50


   id  value
1   2     20
2   2     20
4   4     40
5   4     40


   id  value
0   1     10
1   2     20
3   3     30
4   4     40
6   4     50


   id  value
1   2     20
2   2     20
4   4     40
5   4     40
6   4     50




**Operations between DataFrame and Series**

In [None]:
arr = np.arange(12.).reshape((3, 4))
print(arr)
arr - arr[0]

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]


array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

### Pandas Function Application and Mapping

In [None]:
import pandas as pd
import numpy as np

# Create sample Series
s = pd.Series([1, 2, 3, 4, 5])

# Apply built-in NumPy function to Series
s_squared = s.apply(np.square)
# Square each element using NumPy square function
print(s_squared)
print("\n")

# Apply custom lambda function to Series
s_cubed = s.apply(lambda x: x ** 3)
# Cube each element using lambda function
print(s_cubed)
print("\n")

# Apply custom function with multiple operations
def custom_transform(x):
    return (x * 2) + 10

s_transformed = s.apply(custom_transform)
# Apply custom function that doubles and adds 10 to each element
print(s_transformed)
print("\n")

0     1
1     4
2     9
3    16
4    25
dtype: int64


0      1
1      8
2     27
3     64
4    125
dtype: int64


0    12
1    14
2    16
3    18
4    20
dtype: int64




In [None]:
# Create sample DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [10, 20, 30, 40],
    'C': [100, 200, 300, 400]
})
print(df)
print("\n")

# Apply function to each column (default axis=0)
column_means = df.apply(np.mean)
# Calculate mean of each column using apply
print(column_means)
print("\n")

# Apply function to each row (axis=1)
row_sums = df.apply(np.sum, axis=1)
# Calculate sum of each row by specifying axis=1
print(row_sums)
print("\n")

# Apply different functions to different columns
def apply_different_ops(col):
    if col.name == 'A':
        return col * 2
    elif col.name == 'B':
        return col / 10
    else:
        return col

df_modified = df.apply(apply_different_ops)
# Apply different operations based on column name
print(df_modified)
print("\n")

   A   B    C
0  1  10  100
1  2  20  200
2  3  30  300
3  4  40  400


A      2.5
B     25.0
C    250.0
dtype: float64


0    111
1    222
2    333
3    444
dtype: int64


   A    B    C
0  2  1.0  100
1  4  2.0  200
2  6  3.0  300
3  8  4.0  400




In [None]:
# Create DataFrame with mixed data types
df_numeric = pd.DataFrame({
    'x': [1.1000, 2.452, 3.378, 4.674],
    'y': [5.56, 6.636, 7.7787887, 8.7688]
})
print(df_numeric)
print("\n")

# Apply function element-wise using applymap (deprecated in newer versions)
# Using apply with lambda for element-wise operations
df_rounded = df_numeric.apply(lambda x: x.round(2))
# Round all elements to 2 decimal place
print(df_rounded)
print("\n")

# Apply mathematical function to all elements
df_sqrt = df_numeric.apply(np.sqrt)
# Calculate square root of all elements in DataFrame
print(df_sqrt)
print("\n")

# Apply conditional function element-wise
df_conditional = df_numeric.apply(lambda x: np.where(x > 3, x * 2, x))
# Double values greater than 3, keep others unchanged
print(df_conditional)
print("\n")

       x         y
0  1.100  5.560000
1  2.452  6.636000
2  3.378  7.778789
3  4.674  8.768800


      x     y
0  1.10  5.56
1  2.45  6.64
2  3.38  7.78
3  4.67  8.77


          x         y
0  1.048809  2.357965
1  1.565886  2.576043
2  1.837934  2.789048
3  2.161944  2.961216


       x          y
0  1.100  11.120000
1  2.452  13.272000
2  6.756  15.557577
3  9.348  17.537600




**Using map() for Series**

In [None]:
# Create Series for mapping examples
s_categories = pd.Series(['A', 'B', 'C', 'A', 'B'])
print(s_categories)
print("\n")

# Map using dictionary
category_map = {'A': 'High', 'B': 'Medium', 'C': 'Low'}
s_mapped = s_categories.map(category_map)
# Replace category codes with descriptive labels using dictionary mapping
print(s_mapped)
print("\n")

# Map using function
def grade_to_score(grade):
    return {'A': 90, 'B': 80, 'C': 70}.get(grade, 0)

s_scores = s_categories.map(grade_to_score)
# Convert letter grades to numerical scores using function mapping
print(s_scores)
print("\n")

# Map with default value for unmapped items
s_with_default = s_categories.map({'A': 'Alpha', 'B': 'Beta'}, na_action=None)
# Map with missing values becoming NaN for unmapped items
print(s_with_default)
print("\n")

0    A
1    B
2    C
3    A
4    B
dtype: object


0      High
1    Medium
2       Low
3      High
4    Medium
dtype: object


0    90
1    80
2    70
3    90
4    80
dtype: int64


0    Alpha
1     Beta
2      NaN
3    Alpha
4     Beta
dtype: object




**Using replace() Method**

In [None]:
# Create DataFrame with values to replace
df_replace = pd.DataFrame({
    'status': ['active', 'inactive', 'pending', 'active'],
    'rating': [5, 3, 4, 5]
})
print(df_replace)
print("\n")

# Replace single value
df_single_replace = df_replace.replace('inactive', 'blocked')
# Replace single string value throughout DataFrame
print(df_single_replace)
print("\n")

# Replace multiple values using dictionary
df_multi_replace = df_replace.replace({
    'status': {'active': 'enabled', 'inactive': 'disabled'},
    'rating': {5: 'Excellent', 3: 'Poor', 4: 'Good'}
})
# Replace different values in different columns using nested dictionary
print(df_multi_replace)
print("\n")

# Replace using regular expressions
df_regex_replace = pd.DataFrame({'text': ['abc123', 'def456', 'ghi789']})
df_regex_replace['cleaned'] = df_regex_replace['text'].replace(r'\d+', 'NUM', regex=True)
# Replace numeric patterns with 'NUM' using regex
print(df_regex_replace)
print("\n")

     status  rating
0    active       5
1  inactive       3
2   pending       4
3    active       5


    status  rating
0   active       5
1  blocked       3
2  pending       4
3   active       5


     status     rating
0   enabled  Excellent
1  disabled       Poor
2   pending       Good
3   enabled  Excellent


     text cleaned
0  abc123  abcNUM
1  def456  defNUM
2  ghi789  ghiNUM




**Advanced Function Application with agg()**

In [None]:
# Create DataFrame for aggregation examples
df_sales = pd.DataFrame({
    'product': ['A', 'B', 'A', 'B', 'A', 'B'],
    'sales': [100, 150, 200, 120, 180, 90],
    'profit': [20, 30, 40, 25, 35, 18]
})
print(df_sales)
print("\n")

# Apply multiple aggregation functions
multi_agg = df_sales.agg({
    'sales': ['sum', 'mean', 'std'],
    'profit': ['min', 'max', 'count']
})
# Apply different aggregation functions to different columns
print(multi_agg)
print("\n")

# Apply custom aggregation function
def coefficient_of_variation(x):
    return x.std() / x.mean() if x.mean() != 0 else 0

cv_agg = df_sales.agg({
    'sales': coefficient_of_variation,
    'profit': coefficient_of_variation
})
# Apply custom coefficient of variation function to numeric columns
print(cv_agg)
print("\n")

# Apply same function to all columns
all_means = df_sales[['sales', 'profit']].agg('mean')
# Calculate mean for all specified numeric columns
print(all_means)
print("\n")

  product  sales  profit
0       A    100      20
1       B    150      30
2       A    200      40
3       B    120      25
4       A    180      35
5       B     90      18


            sales  profit
sum    840.000000     NaN
mean   140.000000     NaN
std     44.271887     NaN
min           NaN    18.0
max           NaN    40.0
count         NaN     6.0


sales     0.316228
profit    0.307226
dtype: float64


sales     140.0
profit     28.0
dtype: float64




**Transform**

In [None]:
# Create DataFrame for transform examples
df_transform = pd.DataFrame({
    'group': ['X', 'X', 'Y', 'Y', 'Z', 'Z'],
    'value': [10, 20, 30, 40, 50, 60]
})
print(df_transform)
print("\n")

# Apply transform to get group-wise operations
df_transform['group_mean'] = df_transform.groupby('group')['value'].transform('mean')
# Add column with mean value for each group using transform
print(df_transform)
print("\n")

# Apply custom transform function
def demean(x):
    return x - x.mean()

df_transform['demeaned'] = df_transform.groupby('group')['value'].transform(demean)
# Center values around group mean using custom transform function
print(df_transform)
print("\n")

# Transform with multiple operations
df_transform['normalized'] = df_transform.groupby('group')['value'].transform(
    lambda x: (x - x.min()) / (x.max() - x.min()) if x.max() != x.min() else 0
)
# Normalize values within each group to 0-1 range
print(df_transform)
print("\n")

  group  value
0     X     10
1     X     20
2     Y     30
3     Y     40
4     Z     50
5     Z     60


  group  value  group_mean
0     X     10        15.0
1     X     20        15.0
2     Y     30        35.0
3     Y     40        35.0
4     Z     50        55.0
5     Z     60        55.0


  group  value  group_mean  demeaned
0     X     10        15.0      -5.0
1     X     20        15.0       5.0
2     Y     30        35.0      -5.0
3     Y     40        35.0       5.0
4     Z     50        55.0      -5.0
5     Z     60        55.0       5.0


  group  value  group_mean  demeaned  normalized
0     X     10        15.0      -5.0         0.0
1     X     20        15.0       5.0         1.0
2     Y     30        35.0      -5.0         0.0
3     Y     40        35.0       5.0         1.0
4     Z     50        55.0      -5.0         0.0
5     Z     60        55.0       5.0         1.0




**Apply with Additional Arguments**

In [6]:
# Create DataFrame for examples with additional arguments
df_args = pd.DataFrame({
    'price': [100, 200, 300, 400],
    'quantity': [2, 3, 1, 4]
})
print(df_args)
print("\n")

# Apply function with additional positional arguments
def calculate_total(price, quantity, tax_rate):
    return price * quantity * (1 + tax_rate)


#Case 1 — apply on DataFrame rows
df_args['total_with_tax'] = df_args.apply(
    lambda row: calculate_total(row['price'], row['quantity'], 0.1), axis=1
)
# Calculate total with tax using apply with additional argument
print(df_args)
print("\n")

# Apply function with keyword arguments
def discount_price(price, discount=0.1, min_price=50):
    discounted = price * (1 - discount)
    return max(discounted, min_price)

#Case 2 — apply on a Series
df_args['discounted'] = df_args['price'].apply(
    discount_price, discount=0.15, min_price=80
)

# Apply discount function with keyword arguments
print(df_args)
print("\n")

   price  quantity
0    100         2
1    200         3
2    300         1
3    400         4


   price  quantity  total_with_tax
0    100         2           220.0
1    200         3           660.0
2    300         1           330.0
3    400         4          1760.0


   price  quantity  total_with_tax  discounted
0    100         2           220.0        85.0
1    200         3           660.0       170.0
2    300         1           330.0       255.0
3    400         4          1760.0       340.0




**Vectorized Operations vs Apply**

In [7]:
# Create large Series for performance comparison
large_s = pd.Series(np.random.randn(10000))

# Vectorized operation (fast)
vectorized_result = large_s ** 2 + large_s * 3
# Perform vectorized mathematical operation (much faster)
print(f"Vectorized operation completed on {len(large_s)} elements")
print("\n")

# Apply with lambda (slower)
apply_result = large_s.apply(lambda x: x ** 2 + x * 3)
# Same operation using apply (slower but more flexible)
print(f"Apply operation completed on {len(large_s)} elements")
print("\n")

# When to use apply: complex logic that can't be vectorized
def complex_logic(x):
    if x > 1:
        return x ** 2
    elif x < -1:
        return x ** 3
    else:
        return x

complex_result = large_s.apply(complex_logic)
# Use apply when logic is too complex for vectorization
print(f"Complex logic applied to {len(large_s)} elements")
print("\n")

Vectorized operation completed on 10000 elements


Apply operation completed on 10000 elements


Complex logic applied to 10000 elements




**Working with Missing Values in Function Application**

In [11]:
# Create Series with missing values
s_missing = pd.Series([1, 2, np.nan, 4, 5])
print(s_missing)
print("\n")

# Drop missing values
s_dropped = s_missing.dropna()
# Remove NaN values
print(s_dropped)
print("\n")

# Apply function that handles NaN
def safe_log(x):
    return np.log(x) if pd.notna(x) and x > 0 else np.nan

s_logged = s_missing.apply(safe_log)
# Apply logarithm function with NaN handling
print(s_logged)
print("\n")

# Use skipna parameter in aggregation
df_missing = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, 7, 8]
})

print(df_missing)
print("\n")

# Calculate mean without NaN values using pandas .mean()
mean_with_na = df_missing.mean(skipna=True)
print(mean_with_na)
print("\n")

# Apply function only to non-null values
s_non_null = s_missing.dropna().apply(lambda x: x * 10)
# Apply function only after removing NaN values
print(s_non_null)
print("\n")

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
dtype: float64


0    1.0
1    2.0
3    4.0
4    5.0
dtype: float64


0    0.000000
1    0.693147
2         NaN
3    1.386294
4    1.609438
dtype: float64


     A    B
0  1.0  5.0
1  2.0  NaN
2  NaN  7.0
3  4.0  8.0


A    2.333333
B    6.666667
dtype: float64


0    10.0
1    20.0
3    40.0
4    50.0
dtype: float64




**Function Application with GroupBy**

In [12]:
# Create DataFrame for groupby function application
df_groupby = pd.DataFrame({
    'category': ['A', 'A', 'B', 'B', 'C', 'C'],
    'value1': [10, 20, 30, 40, 50, 60],
    'value2': [1, 2, 3, 4, 5, 6]
})
print(df_groupby)
print("\n")

# Apply function to each group
def group_summary(group):
    return pd.Series({
        'sum_val1': group['value1'].sum(),
        'mean_val2': group['value2'].mean(),
        'count': len(group)
    })

group_results = df_groupby.groupby('category').apply(group_summary)
# Apply custom summary function to each group
print(group_results)
print("\n")

# Apply different functions to different columns per group
group_agg = df_groupby.groupby('category').agg({
    'value1': ['sum', 'max'],
    'value2': 'mean'
})
# Apply multiple aggregation functions per group and column
print(group_agg)
print("\n")

# Filter groups based on condition
def filter_groups(group):
    return group['value1'].sum() > 25

filtered_groups = df_groupby.groupby('category').filter(filter_groups)
# Keep only groups where sum of value1 exceeds 25
print(filtered_groups)
print("\n")

  category  value1  value2
0        A      10       1
1        A      20       2
2        B      30       3
3        B      40       4
4        C      50       5
5        C      60       6


          sum_val1  mean_val2  count
category                            
A             30.0        1.5    2.0
B             70.0        3.5    2.0
C            110.0        5.5    2.0


         value1     value2
            sum max   mean
category                  
A            30  20    1.5
B            70  40    3.5
C           110  60    5.5


  category  value1  value2
0        A      10       1
1        A      20       2
2        B      30       3
3        B      40       4
4        C      50       5
5        C      60       6




  group_results = df_groupby.groupby('category').apply(group_summary)
