# Filter Rows and Columns in Pandas Using query() and isin()

# 1. Basic Examples with isin()

In [2]:
import pandas as pd
import numpy as np

# Sample employee table used by the examples below: one list per column,
# ten employees in total.
data = dict(
    employee_id=list(range(101, 111)),
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Henry', 'Ivy', 'Jack'],
    department=['IT', 'HR', 'Finance', 'IT', 'Marketing', 'Finance', 'HR', 'IT', 'Marketing', 'Finance'],
    position=['Developer', 'Manager', 'Analyst', 'Developer', 'Specialist', 'Manager', 'Analyst', 'Developer', 'Specialist', 'Analyst'],
    salary=[75000, 85000, 65000, 78000, 60000, 90000, 62000, 82000, 58000, 67000],
    experience=[3, 7, 2, 4, 1, 8, 2, 5, 1, 3],
    location=['NYC', 'LA', 'Chicago', 'NYC', 'LA', 'Chicago', 'NYC', 'LA', 'Chicago', 'NYC'],
    performance=['Excellent', 'Good', 'Average', 'Excellent', 'Good', 'Excellent', 'Average', 'Good', 'Average', 'Good'],
)

df = pd.DataFrame(data)

# Show the full table followed by a separator rule
print("Original DataFrame:", df, sep="\n")
print(f"\n{'=' * 80}")

Original DataFrame:
   employee_id     name department    position  salary  experience location  \
0          101    Alice         IT   Developer   75000           3      NYC   
1          102      Bob         HR     Manager   85000           7       LA   
2          103  Charlie    Finance     Analyst   65000           2  Chicago   
3          104    David         IT   Developer   78000           4      NYC   
4          105      Eva  Marketing  Specialist   60000           1       LA   
5          106    Frank    Finance     Manager   90000           8  Chicago   
6          107    Grace         HR     Analyst   62000           2      NYC   
7          108    Henry         IT   Developer   82000           5       LA   
8          109      Ivy  Marketing  Specialist   58000           1  Chicago   
9          110     Jack    Finance     Analyst   67000           3      NYC   

  performance  
0   Excellent  
1        Good  
2     Average  
3   Excellent  
4        Good  
5   Excellent 

## Example 1.1: Basic isin() filtering

In [3]:
# Membership test on one column: True where the department is IT or Finance
dept_mask = df['department'].isin(['IT', 'Finance'])
it_finance_employees = df[dept_mask]
print("1. Employees in IT or Finance departments:", it_finance_employees, sep="\n")
print(f"Count: {len(it_finance_employees)} employees")

# The same idea with numeric values: pick rows by employee ID
id_mask = df['employee_id'].isin([101, 104, 107, 110])
selected_ids = df[id_mask]
print("\n2. Employees with IDs 101, 104, 107, 110:",
      selected_ids[['employee_id', 'name', 'department']], sep="\n")

# Rows whose position is either Manager or Analyst
position_mask = df['position'].isin(['Manager', 'Analyst'])
management_positions = df[position_mask]
print("\n3. Manager or Analyst positions:",
      management_positions[['name', 'position', 'department']], sep="\n")

1. Employees in IT or Finance departments:
   employee_id     name department   position  salary  experience location  \
0          101    Alice         IT  Developer   75000           3      NYC   
2          103  Charlie    Finance    Analyst   65000           2  Chicago   
3          104    David         IT  Developer   78000           4      NYC   
5          106    Frank    Finance    Manager   90000           8  Chicago   
7          108    Henry         IT  Developer   82000           5       LA   
9          110     Jack    Finance    Analyst   67000           3      NYC   

  performance  
0   Excellent  
2     Average  
3   Excellent  
5   Excellent  
7        Good  
9        Good  
Count: 6 employees

2. Employees with IDs 101, 104, 107, 110:
   employee_id   name department
0          101  Alice         IT
3          104  David         IT
6          107  Grace         HR
9          110   Jack    Finance

3. Manager or Analyst positions:
      name position department
1     

## Example 1.2: isin() with multiple columns

In [4]:
# Build one mask per column, then AND them: a row is kept only when its
# department AND its position are both in the allowed sets.
in_target_department = df['department'].isin(['IT', 'Finance'])
in_target_position = df['position'].isin(['Developer', 'Analyst'])
dept_pos_filter = df[in_target_department & in_target_position]

print("IT/Finance department AND Developer/Analyst positions:",
      dept_pos_filter[['name', 'department', 'position']], sep="\n")

IT/Finance department AND Developer/Analyst positions:
      name department   position
0    Alice         IT  Developer
2  Charlie    Finance    Analyst
3    David         IT  Developer
7    Henry         IT  Developer
9     Jack    Finance    Analyst


# 2. Basic Examples with query()
## Example 2.1: Simple query() filtering

In [5]:
# Numeric comparison: keep rows whose salary exceeds 80,000
salary_condition = 'salary > 80000'
high_salary = df.query(salary_condition)
print("1. Employees with salary > $80,000:", high_salary, sep="\n")

# Equality on a text column; the string literal is quoted inside the expression
department_condition = 'department == "IT"'
it_employees = df.query(department_condition)
print("\n2. IT Department employees:", it_employees, sep="\n")

# Two conditions joined with `and`: both must hold for a row to be kept
combined_condition = 'experience > 5 and salary > 70000'
experienced_high_salary = df.query(combined_condition)
print("\n3. Experienced (>5 years) AND high salary (>$70,000):",
      experienced_high_salary, sep="\n")

1. Employees with salary > $80,000:
   employee_id   name department   position  salary  experience location  \
1          102    Bob         HR    Manager   85000           7       LA   
5          106  Frank    Finance    Manager   90000           8  Chicago   
7          108  Henry         IT  Developer   82000           5       LA   

  performance  
1        Good  
5   Excellent  
7        Good  

2. IT Department employees:
   employee_id   name department   position  salary  experience location  \
0          101  Alice         IT  Developer   75000           3      NYC   
3          104  David         IT  Developer   78000           4      NYC   
7          108  Henry         IT  Developer   82000           5       LA   

  performance  
0   Excellent  
3   Excellent  
7        Good  

3. Experienced (>5 years) AND high salary (>$70,000):
   employee_id   name department position  salary  experience location  \
1          102    Bob         HR  Manager   85000           7       

## Example 2.2: query() with OR conditions

In [6]:
# `or` keeps a row when either side of the expression is true
either_department = 'department == "IT" or department == "Finance"'
it_or_finance = df.query(either_department)
print("1. IT OR Finance department:", it_or_finance[['name', 'department']], sep="\n")

# Same pattern applied to the location column
either_location = 'location == "NYC" or location == "LA"'
multiple_locations = df.query(either_location)
print("\n2. NYC OR LA location:", multiple_locations[['name', 'location']], sep="\n")

1. IT OR Finance department:
      name department
0    Alice         IT
2  Charlie    Finance
3    David         IT
5    Frank    Finance
7    Henry         IT
9     Jack    Finance

2. NYC OR LA location:
    name location
0  Alice      NYC
1    Bob       LA
3  David      NYC
4    Eva       LA
6  Grace      NYC
7  Henry       LA
9   Jack      NYC


## Example 2.3: query() with variables

In [7]:
# Thresholds live in ordinary Python variables ...
min_salary = 70000
max_experience = 5
good_performance = 'Good'

# ... and are referenced from the query expression with the @ prefix.
variable_conditions = ' and '.join([
    'salary >= @min_salary',
    'experience <= @max_experience',
    'performance == @good_performance',
])
result = df.query(variable_conditions)
print(f"Salary >= ${min_salary}, Experience <= {max_experience}, Performance == '{good_performance}':",
      result, sep="\n")

Salary >= $70000, Experience <= 5, Performance == 'Good':
   employee_id   name department   position  salary  experience location  \
7          108  Henry         IT  Developer   82000           5       LA   

  performance  
7        Good  


# 3. Combining isin() and query()
## Example 3.1: Using the `in` operator (the isin() equivalent) inside query()

In [8]:
# Allowed values for each column
tech_departments = ['IT', 'Finance']
high_performers = ['Excellent', 'Good']

# Inside query(), `column in @some_list` is the membership test that
# isin() performs in regular boolean indexing.
membership_expr = 'department in @tech_departments and performance in @high_performers'
result = df.query(membership_expr)
print("Tech departments (IT/Finance) with high performance (Excellent/Good):", result, sep="\n")

Tech departments (IT/Finance) with high performance (Excellent/Good):
   employee_id   name department   position  salary  experience location  \
0          101  Alice         IT  Developer   75000           3      NYC   
3          104  David         IT  Developer   78000           4      NYC   
5          106  Frank    Finance    Manager   90000           8  Chicago   
7          108  Henry         IT  Developer   82000           5       LA   
9          110   Jack    Finance    Analyst   67000           3      NYC   

  performance  
0   Excellent  
3   Excellent  
5   Excellent  
7        Good  
9        Good  


## Example 3.2: Complex combination

In [9]:
# Filter criteria, kept in variables so they can be tuned in one place
high_salary_threshold = 75000
target_departments = ['IT', 'Finance', 'Marketing']
target_positions = ['Developer', 'Analyst', 'Specialist']

# One clause per criterion, joined with `and` into a single query expression
criteria = [
    'salary >= @high_salary_threshold',
    'department in @target_departments',
    'position in @target_positions',
]
complex_filter = df.query(' and '.join(criteria))
print(f"Salary >= ${high_salary_threshold}, Target departments, Target positions:",
      complex_filter, sep="\n")

Salary >= $75000, Target departments, Target positions:
   employee_id   name department   position  salary  experience location  \
0          101  Alice         IT  Developer   75000           3      NYC   
3          104  David         IT  Developer   78000           4      NYC   
7          108  Henry         IT  Developer   82000           5       LA   

  performance  
0   Excellent  
3   Excellent  
7        Good  


# 4. Real-World Business Examples

## Example 4.1: Employee Management System

In [10]:
# Synthetic employee table. The seed is fixed and the random columns are
# drawn in a fixed order so the dataset is identical on every run.
np.random.seed(42)
n_employees = 50

department_names = ['IT', 'HR', 'Finance', 'Marketing', 'Operations', 'Sales']
grade_levels = ['A', 'B', 'C', 'D']
office_locations = ['HQ', 'Branch1', 'Branch2', 'Remote']
review_outcomes = ['Exceeds', 'Meets', 'Below']

employee_data = {
    'emp_id': range(1000, 1000 + n_employees),
    'name': [f'Employee_{i}' for i in range(n_employees)],
    'department': np.random.choice(department_names, n_employees),
    'grade': np.random.choice(grade_levels, n_employees, p=[0.2, 0.3, 0.4, 0.1]),
    'salary': np.random.randint(40000, 120000, n_employees),
    'years_exp': np.random.randint(1, 20, n_employees),
    'location': np.random.choice(office_locations, n_employees),
    'project_count': np.random.randint(1, 10, n_employees),
    'last_perf_review': np.random.choice(review_outcomes, n_employees, p=[0.3, 0.5, 0.2]),
}

employees = pd.DataFrame(employee_data)
print("Employee Dataset (first 10 rows):", employees.head(10), sep="\n")
print(f"\nTotal employees: {len(employees)}")

# Business Query 1: promotion candidates -- top grades, "Exceeds" review,
# at least 3 years of experience and at least 5 projects.
promotion_rules = [
    'grade in ["A", "B"]',
    'last_perf_review == "Exceeds"',
    'years_exp >= 3',
    'project_count >= 5',
]
promotion_criteria = employees.query(' and '.join(promotion_rules))
promotion_columns = ['emp_id', 'name', 'department', 'grade', 'last_perf_review', 'years_exp']
print("\n1. Promotion candidates (High grade, Exceeds expectations, Exp >= 3, Projects >= 5):",
      promotion_criteria[promotion_columns], sep="\n")
print(f"Count: {len(promotion_criteria)} employees")

# Business Query 2: well-rated, high-grade employees paid below their
# department's mean salary. The per-department mean is attached to every row
# so the query can compare two columns directly.
salary_by_department = employees.groupby('department')['salary']
avg_salary_by_dept = salary_by_department.mean().to_dict()
employees['dept_avg_salary'] = employees['department'].map(avg_salary_by_dept)

underpaid_rules = [
    'salary < dept_avg_salary',
    'last_perf_review in ["Exceeds", "Meets"]',
    'grade in ["A", "B"]',
]
underpaid_high_performers = employees.query(' and '.join(underpaid_rules))
underpaid_columns = ['name', 'department', 'salary', 'dept_avg_salary', 'last_perf_review']
print("\n2. Underpaid high performers (Salary < department average, Good performance, High grade):",
      underpaid_high_performers[underpaid_columns], sep="\n")

Employee Dataset (first 10 rows):
   emp_id        name  department grade  salary  years_exp location  \
0    1000  Employee_0   Marketing     C  117373          9   Remote   
1    1001  Employee_1  Operations     B  119575          5   Remote   
2    1002  Employee_2     Finance     A  103335          1  Branch2   
3    1003  Employee_3  Operations     B   50965         19  Branch1   
4    1004  Employee_4  Operations     B   64538         10   Remote   
5    1005  Employee_5          HR     C  110592         12       HQ   
6    1006  Employee_6     Finance     C   48110         15  Branch2   
7    1007  Employee_7     Finance     C  119309          9   Remote   
8    1008  Employee_8     Finance     A   67266         17       HQ   
9    1009  Employee_9  Operations     B   92992         17       HQ   

   project_count last_perf_review  
0              1            Meets  
1              8            Below  
2              4            Meets  
3              6            Meets  
4   

## Example 4.2: Sales Data Analysis

In [11]:
# Synthetic sales table: 100 daily sales starting 2024-01-01. The seed is
# fixed and the random columns are drawn in a fixed order for reproducibility.
np.random.seed(42)
n_sales = 100

product_codes = ['P001', 'P002', 'P003', 'P004', 'P005', 'P006']
category_names = ['Electronics', 'Clothing', 'Home', 'Books', 'Toys']
region_names = ['North', 'South', 'East', 'West']
salespeople = ['John', 'Sarah', 'Mike', 'Emma', 'David']
customer_types = ['New', 'Returning', 'VIP']

sales_data = {
    'sale_id': range(1000, 1000 + n_sales),
    'product_id': np.random.choice(product_codes, n_sales),
    'category': np.random.choice(category_names, n_sales),
    'region': np.random.choice(region_names, n_sales),
    'salesperson': np.random.choice(salespeople, n_sales),
    'quantity': np.random.randint(1, 10, n_sales),
    'unit_price': np.random.uniform(10, 500, n_sales),
    'customer_type': np.random.choice(customer_types, n_sales, p=[0.4, 0.5, 0.1]),
    'sale_date': pd.date_range('2024-01-01', periods=n_sales, freq='D'),
}

# Derived columns: line total plus calendar month name and quarter of the sale
sales = pd.DataFrame(sales_data).assign(
    total_amount=lambda frame: frame['quantity'] * frame['unit_price'],
    month=lambda frame: frame['sale_date'].dt.month_name(),
    quarter=lambda frame: frame['sale_date'].dt.quarter,
)

print("Sales Dataset (first 10 rows):", sales.head(10), sep="\n")

# Sales Query 1: selected products sold in selected regions with a line
# total above 1000
high_value_products = ['P001', 'P003', 'P005']
target_regions = ['North', 'West']

high_value_rules = [
    'product_id in @high_value_products',
    'region in @target_regions',
    'total_amount > 1000',
]
high_value_sales = sales.query(' and '.join(high_value_rules))
print("\n1. High-value products in North/West region with total > $1000:",
      high_value_sales[['sale_date', 'product_id', 'region', 'total_amount', 'salesperson']],
      sep="\n")

# Sales Query 2: VIP and Returning customers buying in the premium
# categories, summarised as revenue per (category, customer type)
premium_categories = ['Electronics', 'Home']
premium_rules = [
    'customer_type in ["VIP", "Returning"]',
    'category in @premium_categories',
]
premium_customers = sales.query(' and '.join(premium_rules))
premium_revenue = premium_customers.groupby(['category', 'customer_type'])['total_amount'].sum()
print("\n2. VIP/Returning customers in Electronics/Home categories:", premium_revenue, sep="\n")

Sales Dataset (first 10 rows):
   sale_id product_id     category region salesperson  quantity  unit_price  \
0     1000       P004         Toys  South        Mike         2  349.516276   
1     1001       P005  Electronics  South        Emma         7  180.684936   
2     1002       P003  Electronics   West        John         6  468.957594   
3     1003       P005  Electronics  South        Emma         3   29.201300   
4     1004       P005  Electronics   East        Mike         9  214.793556   
5     1005       P002        Books  North        John         6  484.114470   
6     1006       P003         Home   West        Emma         6  278.506223   
7     1007       P003         Home  North        Emma         1  217.500762   
8     1008       P003  Electronics  North        Mike         4  288.574939   
9     1009       P005         Home   East        John         6  292.203042   

  customer_type  sale_date  total_amount    month  quarter  
0           New 2024-01-01    699.0325

# 5. Advanced Techniques

## Example 5.1: Reusable filter function with isin()

In [12]:
def dynamic_filter(df, column, values, exclude=False):
    """Keep (or drop) the rows of ``df`` whose ``column`` value is in ``values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : label
        Name of the column to test. A missing column raises ``KeyError``.
    values : list-like or scalar
        Values to match. A single scalar (for example the string ``'IT'``)
        is treated as a one-element list.
    exclude : bool, default False
        When True, return the rows that do NOT match instead.

    Returns
    -------
    pandas.DataFrame
        The selected rows, with their original index labels.
    """
    # Series.isin() rejects bare scalars (strings included) with a TypeError,
    # so wrap a single value instead of failing on dynamic_filter(df, col, 'IT').
    if not pd.api.types.is_list_like(values):
        values = [values]

    mask = df[column].isin(values)
    return df[~mask] if exclude else df[mask]

# Demonstrate both modes of dynamic_filter on the employee table
print("Dynamic filtering function example:")

# Default mode: keep only the listed departments
selected_depts = dynamic_filter(df, 'department', ['IT', 'Finance'])
print("IT/Finance departments:", selected_depts[['name', 'department']], sep="\n")

# exclude=True inverts the selection: everything except the listed departments
excluded_depts = dynamic_filter(df, 'department', ['HR'], exclude=True)
print("\nExcluding HR department:", excluded_depts[['name', 'department']], sep="\n")

Dynamic filtering function example:
IT/Finance departments:
      name department
0    Alice         IT
2  Charlie    Finance
3    David         IT
5    Frank    Finance
7    Henry         IT
9     Jack    Finance

Excluding HR department:
      name department
0    Alice         IT
2  Charlie    Finance
3    David         IT
4      Eva  Marketing
5    Frank    Finance
7    Henry         IT
8      Ivy  Marketing
9     Jack    Finance


## Example 5.2: Chained Filtering

In [13]:
# Each query() returns a new DataFrame, so calls can be chained; every step
# narrows the rows left by the previous one.
salary_rule = 'salary > 60000'
experience_rule = 'experience >= 3'
department_rule = 'department in ["IT", "Finance"]'

filtered_data = df.query(salary_rule).query(experience_rule).query(department_rule)
print("Chained filtering: Salary > 60000, Experience >= 3, IT/Finance departments:",
      filtered_data, sep="\n")

Chained filtering: Salary > 60000, Experience >= 3, IT/Finance departments:
   employee_id   name department   position  salary  experience location  \
0          101  Alice         IT  Developer   75000           3      NYC   
3          104  David         IT  Developer   78000           4      NYC   
5          106  Frank    Finance    Manager   90000           8  Chicago   
7          108  Henry         IT  Developer   82000           5       LA   
9          110   Jack    Finance    Analyst   67000           3      NYC   

  performance  
0   Excellent  
3   Excellent  
5   Excellent  
7        Good  
9        Good  


## Example 5.3: Filtering with Index

In [14]:
# Move employee_id out of the columns and into the row index
df_indexed = df.set_index('employee_id')

# Within query(), the name `index` refers to the row index, so it can be
# range-checked alongside ordinary column conditions.
index_and_department = '100 < index < 108 and department == "IT"'
result = df_indexed.query(index_and_department)
print("Employees with ID between 100-108 in IT department:", result, sep="\n")

Employees with ID between 100-108 in IT department:
              name department   position  salary  experience location  \
employee_id                                                             
101          Alice         IT  Developer   75000           3      NYC   
104          David         IT  Developer   78000           4      NYC   

            performance  
employee_id              
101           Excellent  
104           Excellent  


# 6. Performance Comparison

In [15]:
import time

# Create large dataset
large_df = pd.DataFrame({
    'A': np.random.randn(100000),
    'B': np.random.randn(100000),
    'C': np.random.choice(['X', 'Y', 'Z', 'W'], 100000),
    'D': np.random.randint(1, 100, 100000)
})

print("Performance Comparison (100,000 rows):")

# Method 1: Direct boolean indexing with isin
start = time.time()
result1 = large_df[large_df['C'].isin(['X', 'Y'])]
time1 = time.time() - start
print(f"1. Direct boolean + isin: {time1:.4f} seconds")

# Method 2: query() with isin
start = time.time()
result2 = large_df.query('C in ["X", "Y"]')
time2 = time.time() - start
print(f"2. query() with isin: {time2:.4f} seconds")

# Method 3: Multiple OR conditions
start = time.time()
result3 = large_df.query('C == "X" or C == "Y"')
time3 = time.time() - start
print(f"3. query() with OR: {time3:.4f} seconds")

Performance Comparison (100,000 rows):
1. Direct boolean + isin: 0.0054 seconds
2. query() with isin: 0.0154 seconds
3. query() with OR: 0.0095 seconds


# 7. Additional Practical Examples

## Example 7.1: Student Grade Analysis

In [16]:
# Student dataset
students = pd.DataFrame({
    'student_id': ['S001', 'S002', 'S003', 'S004', 'S005', 'S006', 'S007', 'S008'],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Evan', 'Fiona', 'George', 'Hannah'],
    'major': ['CS', 'Math', 'Physics', 'CS', 'Math', 'Physics', 'CS', 'Math'],
    'gpa': [3.8, 3.2, 3.9, 3.5, 3.1, 3.7, 3.4, 3.6],
    'courses': ['CS101,CS102,MATH201', 'MATH201,PHYS101', 'PHYS101,PHYS102', 
                'CS101,CS103', 'MATH201,MATH202', 'PHYS101,CS101', 
                'CS102,CS103', 'MATH202,PHYS101'],
    'graduation_year': [2024, 2024, 2025, 2024, 2025, 2024, 2025, 2024]
})

# Find students taking specific courses
cs_courses = ['CS101', 'CS102']
cs_students = students[students['courses'].str.contains('|'.join(cs_courses))]
print("Students taking CS courses:")
print(cs_students)

# Query for honor students
honor_students = students.query('gpa >= 3.5 and graduation_year == 2024')
print("\nHonor students graduating in 2024:")
print(honor_students)

Students taking CS courses:
  student_id    name    major  gpa              courses  graduation_year
0       S001   Alice       CS  3.8  CS101,CS102,MATH201             2024
3       S004   Diana       CS  3.5          CS101,CS103             2024
5       S006   Fiona  Physics  3.7        PHYS101,CS101             2024
6       S007  George       CS  3.4          CS102,CS103             2025

Honor students graduating in 2024:
  student_id    name    major  gpa              courses  graduation_year
0       S001   Alice       CS  3.8  CS101,CS102,MATH201             2024
3       S004   Diana       CS  3.5          CS101,CS103             2024
5       S006   Fiona  Physics  3.7        PHYS101,CS101             2024
7       S008  Hannah     Math  3.6      MATH202,PHYS101             2024


## Example 7.2: Inventory Management

In [17]:
# Inventory dataset, written one product per row for readability
inventory_columns = ['product_id', 'product_name', 'category', 'quantity',
                     'price', 'reorder_level', 'supplier']
inventory_rows = [
    ('P100', 'Laptop',     'Electronics',  15, 999.99, 10, 'Supplier_A'),
    ('P101', 'Mouse',      'Accessories', 100,  29.99, 50, 'Supplier_B'),
    ('P102', 'Keyboard',   'Accessories',  75,  79.99, 40, 'Supplier_B'),
    ('P103', 'Monitor',    'Electronics',  25, 299.99, 15, 'Supplier_A'),
    ('P104', 'Printer',    'Office',       30, 199.99, 20, 'Supplier_C'),
    ('P105', 'Tablet',     'Electronics',  40, 499.99, 25, 'Supplier_A'),
    ('P106', 'Phone',      'Electronics',  60, 799.99, 30, 'Supplier_D'),
    ('P107', 'Headphones', 'Accessories', 120, 149.99, 60, 'Supplier_B'),
]
inventory = pd.DataFrame(inventory_rows, columns=inventory_columns)

# query() can compare two columns directly: stock at or below its reorder
# level. With this sample data every item is above its level, so the result
# is an empty frame.
reorder_rule = 'quantity <= reorder_level'
need_reorder = inventory.query(reorder_rule)
print("Items needing reorder:",
      need_reorder[['product_name', 'category', 'quantity', 'reorder_level']], sep="\n")

# Electronics priced above 300
electronics_rule = 'category == "Electronics" and price > 300'
high_value_electronics = inventory.query(electronics_rule)
print("\nHigh-value electronics (price > $300):",
      high_value_electronics[['product_name', 'price', 'quantity']], sep="\n")

Items needing reorder:
Empty DataFrame
Columns: [product_name, category, quantity, reorder_level]
Index: []

High-value electronics (price > $300):
  product_name   price  quantity
0       Laptop  999.99        15
5       Tablet  499.99        40
6        Phone  799.99        60


# 8. Tips and Best Practices

In [18]:
# Print a summary of recommendations, then a list of wrong/right usage pairs.
print("="*80)
print("BEST PRACTICES FOR query() AND isin()")
print("="*80)

tips = [
    "1. Use query() for cleaner syntax with complex conditions",
    "2. Use isin() instead of multiple OR conditions for better readability",
    "3. Pre-declare variables to use in query() with @ symbol",
    "4. Combine query() and isin() for maximum flexibility",
    "5. For performance, pre-filter with query() before using isin() on large datasets",
    "6. Use parentheses properly in query() for complex logical operations",
    "7. Remember that query() uses Python syntax, not pandas syntax",
    "8. isin() works with Series, lists, and other iterables",
    "9. Use ~ operator with isin() for NOT IN functionality",
    "10. Test queries on small subsets before applying to entire dataset"
]

for tip in tips:
    print(tip)

print("\n" + "="*80)
print("COMMON PITFALLS TO AVOID")
print("="*80)
# Each pitfall is a wrong/right pair; empty strings print as blank separator lines.
# The parentheses pitfall is shown for plain boolean indexing: there `&` binds
# tighter than `>`, so the unparenthesised form fails. Inside query() the
# default parser gives `&` the precedence of `and`, so 'A > 0 & B > 0' is
# valid there and is not a pitfall.
pitfalls = [
    "✗ Forgetting parentheses in boolean indexing: df[df['A'] > 0 & df['B'] > 0]",
    "✓ Correct: df[(df['A'] > 0) & (df['B'] > 0)]  (inside query(), 'A > 0 & B > 0' already works)",
    "",
    "✗ Using pandas syntax in query(): df.query('df[\"A\"] > 0')",
    "✓ Correct: df.query('A > 0')",
    "",
    "✗ Not using @ for variables: df.query('salary > min_salary')",
    "✓ Correct: df.query('salary > @min_salary')",
    "",
    "✗ Using isin() with wrong brackets: df['dept'].isin('IT', 'HR')",
    "✓ Correct: df['dept'].isin(['IT', 'HR'])"
]

for pitfall in pitfalls:
    print(pitfall)

BEST PRACTICES FOR query() AND isin()
1. Use query() for cleaner syntax with complex conditions
2. Use isin() instead of multiple OR conditions for better readability
3. Pre-declare variables to use in query() with @ symbol
4. Combine query() and isin() for maximum flexibility
5. For performance, pre-filter with query() before using isin() on large datasets
6. Use parentheses properly in query() for complex logical operations
7. Remember that query() uses Python syntax, not pandas syntax
8. isin() works with Series, lists, and other iterables
9. Use ~ operator with isin() for NOT IN functionality
10. Test queries on small subsets before applying to entire dataset

COMMON PITFALLS TO AVOID
✗ Forgetting parentheses in complex conditions: df.query('A > 0 & B > 0')
✓ Correct: df.query('(A > 0) & (B > 0)')

✗ Using pandas syntax in query(): df.query('df["A"] > 0')
✓ Correct: df.query('A > 0')

✗ Not using @ for variables: df.query('salary > min_salary')
✓ Correct: df.query('salary > @min_sa