In [1]:
import pandas as pd
import numpy as np

# 1. Creating and Manipulating DataFrames
# Creating the student_data DataFrame
student_data = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [23, 25, 22, 24, 23],
    'Gender': ['F', 'M', 'M', 'M', 'F'],
    'Score': [85, 78, 92, 70, 88]
})

In [3]:
print(student_data.head(3))

      Name  Age Gender  Score
0    Alice   23      F     85
1      Bob   25      M     78
2  Charlie   22      M     92


In [5]:
gender_f = student_data[student_data['Gender'] == 'F']
print(gender_f)

    Name  Age Gender  Score
0  Alice   23      F     85
4    Eva   23      F     88


In [7]:
student_data['Score1'] = student_data['Score']
student_data['Score2'] = student_data['Score']


In [9]:
student_data['Age1'] = student_data['Age']


In [11]:
print(student_data.head(3))

      Name  Age Gender  Score  Score1  Score2  Age1
0    Alice   23      F     85      85      85    23
1      Bob   25      M     78      78      78    25
2  Charlie   22      M     92      92      92    22


In [13]:
student_data = student_data.drop(columns=['Age1'])

In [15]:
print(student_data.head(3))


      Name  Age Gender  Score  Score1  Score2
0    Alice   23      F     85      85      85
1      Bob   25      M     78      78      78
2  Charlie   22      M     92      92      92


In [17]:
student_data = student_data.drop(columns=['Score1', 'Score2'])

In [19]:
print(student_data.head(3))


      Name  Age Gender  Score
0    Alice   23      F     85
1      Bob   25      M     78
2  Charlie   22      M     92


In [23]:
student_data = student_data.drop(index=3)

In [27]:
print(student_data.head(5))


      Name  Age Gender  Score
0    Alice   23      F     85
1      Bob   25      M     78
2  Charlie   22      M     92
4      Eva   23      F     88


In [29]:
gender_f_high_score = student_data[(student_data['Gender'] == 'F') & (student_data['Score'] > 85)]
print(gender_f_high_score)

  Name  Age Gender  Score
4  Eva   23      F     88


In [31]:
student_data_sorted = student_data.sort_values(by='Score', ascending=False)
print(student_data_sorted)


      Name  Age Gender  Score
2  Charlie   22      M     92
4      Eva   23      F     88
0    Alice   23      F     85
1      Bob   25      M     78


In [33]:
def assign_grade(score):
    if score >= 90:
        return 'A'
    elif 80 <= score < 90:
        return 'B'
    elif 70 <= score < 80:
        return 'C'
    else:
        return 'D'

student_data['Grade'] = student_data['Score'].apply(assign_grade)
print(student_data)


      Name  Age Gender  Score Grade
0    Alice   23      F     85     B
1      Bob   25      M     78     C
2  Charlie   22      M     92     A
4      Eva   23      F     88     B


In [35]:
# 2. DataFrame Operations
# Creating the sales_data DataFrame
sales_data = pd.DataFrame({
    'Month': ['Jan', 'Feb', 'Mar', 'Jan', 'Feb', 'Mar'],
    'Product': ['A', 'A', 'A', 'B', 'B', 'B'],
    'Sales': [150, 200, 250, 300, 400, 500]
})

In [37]:
total_sales = sales_data.groupby('Product')['Sales'].sum()
print(total_sales)

Product
A     600
B    1200
Name: Sales, dtype: int64


In [39]:
highest_sales = sales_data.loc[sales_data.groupby('Product')['Sales'].idxmax()]
print(highest_sales)

  Month Product  Sales
2   Mar       A    250
5   Mar       B    500


In [45]:
sales_data['Sales_Percentage'] = sales_data['Sales'] / sales_data.groupby('Product')['Sales'].transform('sum') * 100


In [47]:
print(sales_data)

  Month Product  Sales  Sales_Percentage
0   Jan       A    150         25.000000
1   Feb       A    200         33.333333
2   Mar       A    250         41.666667
3   Jan       B    300         25.000000
4   Feb       B    400         33.333333
5   Mar       B    500         41.666667


In [49]:
# 3. Handling Missing Data
# Creating the employee_data DataFrame
employee_data = pd.DataFrame({
    'Name': ['John', 'Doe', 'Jane', 'Anna', 'Smith'],
    'Department': ['HR', 'Finance', None, 'IT', 'HR'],
    'Salary': [50000, 60000, 55000, None, 58000]
})


In [51]:
missing_values = employee_data.isnull().sum()
print(missing_values)

Name          0
Department    1
Salary        1
dtype: int64


In [55]:
employee_data['Department'].fillna(employee_data['Department'].mode()[0], inplace=True)


In [59]:
print(employee_data)

    Name Department   Salary
0   John         HR  50000.0
1    Doe    Finance  60000.0
2   Jane         HR  55000.0
3   Anna         IT  55750.0
4  Smith         HR  58000.0


In [67]:
employee_data['Department'].fillna(employee_data['Department'].mode()[0], inplace=True)


In [69]:
print(employee_data)

    Name Department   Salary
0   John         HR  50000.0
1    Doe    Finance  60000.0
2   Jane         HR  55000.0
3   Anna         IT  55750.0
4  Smith         HR  58000.0


In [71]:
employee_data = employee_data.dropna()
print(employee_data)

    Name Department   Salary
0   John         HR  50000.0
1    Doe    Finance  60000.0
2   Jane         HR  55000.0
3   Anna         IT  55750.0
4  Smith         HR  58000.0


In [73]:
# 4. Series Operations
# Creating the temperature Series
temperature = pd.Series([23, 21, 20, 25, 27, 30, 28, 22, 24, 26])

mean_temp = temperature.mean()
median_temp = temperature.median()
std_temp = temperature.std()

print(f"Mean: {mean_temp}, Median: {median_temp}, Standard Deviation: {std_temp}")

Mean: 24.6, Median: 24.5, Standard Deviation: 3.204163957519444


In [75]:
temperature_celsius = (temperature - 32) * 5/9
print(temperature_celsius)

0   -5.000000
1   -6.111111
2   -6.666667
3   -3.888889
4   -2.777778
5   -1.111111
6   -2.222222
7   -5.555556
8   -4.444444
9   -3.333333
dtype: float64


In [77]:
max_index = temperature.idxmax()
min_index = temperature.idxmin()


In [79]:
print(f"Max index: {max_index}, Min index: {min_index}")

Max index: 5, Min index: 2


In [81]:
temperature_sorted_asc = temperature.sort_values()
temperature_sorted_desc = temperature.sort_values(ascending=False)

print(temperature_sorted_asc)
print(temperature_sorted_desc)


2    20
1    21
7    22
0    23
8    24
3    25
9    26
4    27
6    28
5    30
dtype: int64
5    30
6    28
4    27
9    26
3    25
8    24
0    23
7    22
1    21
2    20
dtype: int64


In [83]:
# 5. Merging DataFrames
# Creating the orders and customers DataFrames
orders = pd.DataFrame({
    'OrderID': [1, 2, 3, 4, 5],
    'CustomerID': [101, 102, 103, 104, 101],
    'Product': ['A', 'B', 'A', 'C', 'B']
})

customers = pd.DataFrame({
    'CustomerID': [101, 102, 103, 104],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Location': ['New York', 'Los Angeles', 'Chicago', 'Houston']
})

In [85]:
merged_data = pd.merge(orders, customers, on='CustomerID')

In [87]:
print(merged_data)

   OrderID  CustomerID Product     Name     Location
0        1         101       A    Alice     New York
1        2         102       B      Bob  Los Angeles
2        3         103       A  Charlie      Chicago
3        4         104       C    David      Houston
4        5         101       B    Alice     New York


In [89]:
orders_by_location = merged_data.groupby('Location')['OrderID'].count()
print(orders_by_location)

Location
Chicago        1
Houston        1
Los Angeles    1
New York       2
Name: OrderID, dtype: int64
