In [23]:
# Welcome to Learning Pandas
# This script will include challenges designed to improve your skills in slicing and filtering data using Pandas.

# Let's start by importing the pandas library
import pandas as pd

# Sample dataset to get us started
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'Score': [85, 90, 78, 88, 95]
}

# Create a DataFrame
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City,Score
0,Alice,24,New York,85
1,Bob,27,Los Angeles,90
2,Charlie,22,Chicago,78
3,David,32,Houston,88
4,Eva,29,Phoenix,95


In [2]:
# First Challenge: Display rows where 'Age' is greater than 25
df.loc[df['Age'] > 25]

Unnamed: 0,Name,Age,City,Score
1,Bob,27,Los Angeles,90
3,David,32,Houston,88
4,Eva,29,Phoenix,95


In [3]:
df[df['Age']>25]

Unnamed: 0,Name,Age,City,Score
1,Bob,27,Los Angeles,90
3,David,32,Houston,88
4,Eva,29,Phoenix,95


In [4]:
# Second Challenge: Display only the 'Name' and 'City' columns for rows where 'Score' is greater than 80
df[df['Score'] > 80][['Name', 'City']]

Unnamed: 0,Name,City
0,Alice,New York
1,Bob,Los Angeles
3,David,Houston
4,Eva,Phoenix


In [5]:
# Third Challenge: Display rows where 'City' is either 'Chicago' or 'Houston'
df[df['City'].isin(['Chicago', 'Houston'])]

Unnamed: 0,Name,Age,City,Score
2,Charlie,22,Chicago,78
3,David,32,Houston,88


In [None]:
#1. Using apply() with a Custom Function
#Create a function that:
#	•	Takes an age value and categorizes it into "Young" (≤25) or "Old" (>25).
#	•	Apply this function to the Age column and create a new column called "Age Group".
def categorize_age(age):
    if age <= 25:
        x = 'young'
    else:
        x = 'old'
    return x


df['Age Group'] = df['Age'].apply(categorize_age)
df

Unnamed: 0,Name,Age,City,Score,Age Group
0,Alice,24,New York,85,young
1,Bob,27,Los Angeles,90,old
2,Charlie,22,Chicago,78,young
3,David,32,Houston,88,old
4,Eva,29,Phoenix,95,old


In [None]:
# LAMBDA: df['column'].apply(lambda x: TRUE if x < 0 else FALSE)

#2. Using lambda in apply()

#Modify the Score column:
#	•	If the score is greater than 85, increase it by 5.
#	•	Otherwise, decrease it by 3.
#	•	Use apply() with a lambda function.

df['new_score'] = df['Score'].apply(lambda x: x + 5 if x > 85 else x - 3)
df

Unnamed: 0,Name,Age,City,Score,Age Group,new_score
0,Alice,24,New York,85,young,82
1,Bob,27,Los Angeles,90,old,95
2,Charlie,22,Chicago,78,young,75
3,David,32,Houston,88,old,93
4,Eva,29,Phoenix,95,old,100


In [None]:
#3. Using groupby() for Aggregation

#Group the DataFrame by City and:
#	•	Find the average score for each city.
#	•	Count the number of people in each city.

grouped_df = df.groupby('City')

aggregated_df = pd.DataFrame({'avg_score': grouped_df['Score'].mean(), 'count_ppl': grouped_df['Name'].size()})
aggregated_df

# also
aggregated_df = df.groupby("City").agg(avg_score=("Score", "mean"), count_ppl=("Name", "size"))

Unnamed: 0_level_0,avg_score,count_ppl
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Chicago,78.0,1
Houston,88.0,1
Los Angeles,90.0,1
New York,85.0,1
Phoenix,95.0,1


In [None]:
#4. Using map() for Value Transformation

#Create a dictionary:
city_map = {"New York": "NY", "Los Angeles": "LA", "Chicago": "CHI", "Houston": "HOU", "Phoenix": "PHX"}

#	•	Use .map() to replace the City column values with their abbreviations.
replaced_df = df.copy()
replaced_df['City'] = replaced_df['City'].map(city_map)
replaced_df


Unnamed: 0,Name,Age,City,Score
0,Alice,24,NY,85
1,Bob,27,LA,90
2,Charlie,22,CHI,78
3,David,32,HOU,88
4,Eva,29,PHX,95


In [28]:
#5. Using filter() for Column Selection
#	•	Use .filter() to return only columns that contain the letter “o” in their names.

df.filter(axis=1, regex='o')

Unnamed: 0,Score
0,85
1,90
2,78
3,88
4,95


In [29]:
#6. Using query() for Filtering
#	•	Use .query() to select rows where Age is greater than 25 and Score is above 80.

df.query("Age > 25 and Score > 80")

Unnamed: 0,Name,Age,City,Score
1,Bob,27,Los Angeles,90
3,David,32,Houston,88
4,Eva,29,Phoenix,95


In [30]:
#7. Using pivot_table()

#Create a pivot table where:
#	•	The index is "City",
#	•	The values are "Score",
#	•	The aggregation function is "mean".
pivot_df = df.pivot_table(index='City', values='Score', aggfunc='mean')
pivot_df

Unnamed: 0_level_0,Score
City,Unnamed: 1_level_1
Chicago,78.0
Houston,88.0
Los Angeles,90.0
New York,85.0
Phoenix,95.0


In [31]:
#8. Using .transform() for Column-Wide Operations
#	•	Use .groupby("City")["Score"].transform("mean") to calculate each person’s city’s average score and store it in a new column "City Avg Score".

df['City Avg Score'] = df.groupby('City')['Score'].transform('mean')
df

Unnamed: 0,Name,Age,City,Score,City Avg Score
0,Alice,24,New York,85,85.0
1,Bob,27,Los Angeles,90,90.0
2,Charlie,22,Chicago,78,78.0
3,David,32,Houston,88,88.0
4,Eva,29,Phoenix,95,95.0


In [35]:
#9. Using .explode() on Lists in a Column

#Modify the DataFrame by adding a Subjects column:
df["Subjects"] = [["Math", "Science"], ["History"], ["Science", "Art"], ["Math", "English"], ["History", "English"]]
#	•	Use .explode() to transform this column so each subject has its own row.
exploded_df = df.explode('Subjects')
exploded_df

Unnamed: 0,Name,Age,City,Score,City Avg Score,Subjects
0,Alice,24,New York,85,85.0,Math
0,Alice,24,New York,85,85.0,Science
1,Bob,27,Los Angeles,90,90.0,History
2,Charlie,22,Chicago,78,78.0,Science
2,Charlie,22,Chicago,78,78.0,Art
3,David,32,Houston,88,88.0,Math
3,David,32,Houston,88,88.0,English
4,Eva,29,Phoenix,95,95.0,History
4,Eva,29,Phoenix,95,95.0,English


In [38]:
#10. Using .melt() for Reshaping

#Convert the DataFrame from wide format to long format where:
#	•	"Name" remains as an identifier,
#	•	"Age" and "Score" are converted into a single column with their corresponding values.

melted_df = pd.melt(exploded_df, id_vars='Name', value_vars=['Age', 'Score'])
melted_df

Unnamed: 0,Name,variable,value
0,Alice,Age,24
1,Alice,Age,24
2,Bob,Age,27
3,Charlie,Age,22
4,Charlie,Age,22
5,David,Age,32
6,David,Age,32
7,Eva,Age,29
8,Eva,Age,29
9,Alice,Score,85


In [None]:
import pandas as pd

# Sample dataset
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'Score': [85, 90, 78, 88, 95]
}

df = pd.DataFrame(data)

# 1. Apply a function to categorize age
# Write a function that categorizes 'Age' as 'Young' if <= 25, else 'Old'.
# Apply it to the 'Age' column and store the result in a new column 'Age Group'.
def categorise_age(age):
    if age <= 25:
        return 'young'
    else:
        return 'old'
df['Age Group'] = df['Age'].apply(categorise_age)

# 2. Modify scores using lambda
# Use apply() with a lambda function to increase Score by 5 if > 85, otherwise decrease by 3.
df['New Score'] = df['Score'].apply(lambda x: x+5 if x>85 else x-3)

# 3. Group by City and aggregate avg score and count of people
# Group by 'City' and calculate the mean 'Score' and count of people per city.
grouped_df = df.groupby('City').agg(avg_score=('Score', 'mean'), count_ppl=('Name', 'size'))


# 4. Replace City names with abbreviations
# Use a dictionary to replace full city names with their abbreviations.
city_dict = {'New York':'NY', 'Los Angeles':'LA', 'Chicago':'CH', 'Houston':'HO', 'Phoenix':'PH'}
df['City Abbr.'] = df['City'].map(city_dict)

# 5. Filter columns containing letter 'o'
# Use .filter() to return only columns that contain the letter 'o'.
df.filter(axis=1, regex='o')

# 6. Query rows where Age > 25 and Score > 80
# Use .query() to filter rows where 'Age' is greater than 25 and 'Score' is greater than 80.
df.query('Age > 25 and Score > 80')

# 7. Create a pivot table with City as index and Score mean
# Use pivot_table() to get the mean Score per City.
df.pivot_table(index='City', values='Score', aggfunc='mean')

# 8. Transform to get city-wide average scores for each row
# Use .groupby() with .transform() to add a new column with the average Score per City.
df['Average Score'] = df.groupby('City')['Score'].transform('mean')

# 9. Explode a list column (Subjects) into multiple rows
# Add a 'Subjects' column with lists of subjects and use .explode() to expand it.
df["Subjects"] = [["Math", "Science"], ["History"], ["Science", "Art"], ["Math", "English"], ["History", "English"]]
df = df.explode('Subjects')

# 10. Melt the dataframe from wide to long format
# Use .melt() to reshape the dataframe so 'Age' and 'Score' become a single variable column.
df = df.melt(id_vars='Name', value_vars=['Age', 'Score'])
df

# 11. Merge two DataFrames
# Create an additional DataFrame with 'Name' and 'Department' columns, then merge it with df.
dept = [{'Name': 'Alice', 'Department': 'Finance'},
        {'Name': 'Bob', 'Department': 'HR'},
        {'Name': 'Charlie', 'Department': 'Operations'},
        {'Name': 'David', 'Department': 'Sales'},
        {'Name': 'Eva', 'Department': 'IT'}
        ]
dept_df = pd.DataFrame(dept)
merged_df = pd.merge(df, dept_df, on='Name')
merged_df

# 12. Fill missing values in Score with column mean
# Use .fillna() to replace missing values in the 'Score' column with its mean.
some_dict = [{'A': 'John', 'Score': 3}, {'A': 'Alice', 'Score': 2}, {'A': 'Linda', 'Score': None}]
scores = pd.DataFrame(some_dict)
fill_value = scores['Score'].mean()
scores.fillna(fill_value)


Unnamed: 0,A,Score
0,John,3.0
1,Alice,2.0
2,Linda,2.5


In [145]:
import pandas as pd

# New dataset: Online Store Transactions
data = {
    'TransactionID': [101, 102, 103, 104, 105, 106],
    'Customer': ['Alice', 'Bob', 'Charlie', 'Alice', 'David', 'Eva'],
    'Category': ['Electronics', 'Clothing', 'Electronics', 'Books', 'Clothing', 'Books'],
    'Amount': [250, 80, 120, 30, 60, 15],
    'Discount': [10, 5, 15, 0, 5, 0],
    'Date': ['2023-06-01', '2023-06-02', '2023-06-03', '2023-06-01', '2023-06-04', '2023-06-05']
}

df = pd.DataFrame(data)

# 1. Calculate the net amount (Amount - Discount) and store it in a new column 'Net Amount'
df['Net Amount'] = df['Amount'] - df['Discount']

# 2. Apply a transformation to classify transactions as 'High' if Amount > 100, else 'Low'. Store in 'Transaction Size'.
def categorise_amount(amount):
    if amount > 100:
        return 'High'
    else:
        return 'Low' 
df['Transaction size'] = df['Amount'].apply(categorise_amount)

# 3. Group by 'Customer' and calculate the total amount spent per customer.
grouped_df = df.groupby('Customer').agg(Total=('Amount','sum'))

# 4. Compute the mean transaction amount per category using groupby.
mean_df = df.groupby('Category').agg(Mean=('Amount','mean'))
mean_df

# 5. Create a new column 'Discounted' which is True if Discount > 0, otherwise False.
df['Discounted'] = df['Discount'].apply(lambda x: True if x > 0 else False)

# 6. Filter rows where transactions happened in June and the net amount is greater than 50.
df['Date'] = pd.to_datetime(df['Date'])  # Convert once at the start
filtered_df = df[(df['Date'].dt.month == 6) & (df['Net Amount'] > 50)]

# 7. Create a pivot table with 'Category' as index and aggregate the sum of 'Amount' and mean of 'Discount'.
pivoted_df = df.pivot_table(index='Category', values=['Amount', 'Discount'], aggfunc={'Amount':'sum', 'Discount':'mean'})

# 8. Use transform() to add a new column 'Customer Avg Spend' that contains the average amount spent by each customer.
df['Customer Avg Spend'] = df.groupby('Customer')['Amount'].transform('mean')

# 9. Use apply() with a lambda function to convert 'Date' to datetime and extract the month as a new column 'Month'.
df['Month'] = df['Date'].apply(pd.to_datetime).apply(lambda x: x.month)

# 10. Merge this dataframe with another DataFrame containing customer age info and display the combined data.
age_df = pd.DataFrame([
    {'Customer': 'Alice', 'Age': 21},
    {'Customer': 'Bob', 'Age': 45},
    {'Customer': 'Charlie', 'Age': 46},
    {'Customer': 'David', 'Age': 8},
    {'Customer': 'Eva', 'Age': 13}
])
merged_df = pd.merge(df, age_df, on='Customer')
merged_df

# 11. Fill any missing values in 'Discount' with the overall average discount.
import numpy as np
df['Discount'] = df['Discount'].replace(0, np.nan)  # Replace zeros with NaN
df['Discount Filled'] = df['Discount'].fillna(df['Discount'].mean())

# 12. Sort the dataframe by 'Amount' in descending order and reset the index.
sorted_df = df.sort_values(by='Amount', ascending=False)
sorted_df = sorted_df.reset_index(drop=True)


In [None]:
import pandas as pd
import numpy as np

# New dataset: Employee Performance and Sales
data = {
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice', 'Charlie', 'Bob', 'Eva', 'David'],
    'Department': ['Sales', 'HR', 'IT', 'Sales', 'HR', 'Sales', 'IT', 'HR', 'HR', 'Sales'],
    'Sales': [300, np.nan, 150, 500, 120, 450, 200, np.nan, 180, 600],
    'Bonus': [30, 20, 25, 50, 10, 45, 30, 15, 12, 55],
    'Performance Score': [85, 78, 90, 95, 70, 88, 92, 80, 74, 99]
}

df = pd.DataFrame(data)

# 1. Group employees by 'Department' and calculate the total sales per department.

# 2. Use transform() to create a new column 'Department Avg Sales' that gives the average sales per department.

# 3. Use apply() to standardize 'Performance Score' by subtracting the mean and dividing by standard deviation.

# 4. Use filter() to select only columns containing the word 'Score'.

# 5. Use map() to replace department names with abbreviations (e.g., {'Sales': 'S', 'HR': 'H', 'IT': 'I'}).

# 6. Group by 'Employee' and count the number of sales transactions per employee.

# 7. Use transform() to create a column 'Relative Performance' where each employee’s score is divided by the max score in their department.

# 8. Use apply() with a lambda function to categorize 'Performance Score' as 'Excellent' (>=90), 'Good' (80-89), or 'Needs Improvement' (<80).

# 9. Filter rows where 'Sales' is greater than the department's average sales.

# 10. Create a pivot table showing the sum of 'Sales' and the mean 'Bonus' per 'Department'.

# 11. Merge df with another DataFrame containing Employee tenure in years.

# 12. Fill missing values in 'Sales' with the department's average sales.
