## Write and Run Your First Cell

In [746]:
import pandas as pd

#Creating a simple DataFrame
data = {
    'Name':['Alice', 'Bob', 'Charlie', 'Kimanthi', 'Kaven', 'Sheila', 'Erick'],
    'Age':[25, 30, 35, 23, 18, 22, 28],
    'City':['Nairobi', 'Mombasa', 'Kisumu', 'Juja', 'Juja', 'Meru', 'Mwea'],
    'Occupation':['Thief', 'Scammer', 'Plumber', 'Network Engineer', 'Nurse', 'Chef', 'Doctor']
}

df = pd.DataFrame(data)

# Display DataFrame
df

Unnamed: 0,Name,Age,City,Occupation
0,Alice,25,Nairobi,Thief
1,Bob,30,Mombasa,Scammer
2,Charlie,35,Kisumu,Plumber
3,Kimanthi,23,Juja,Network Engineer
4,Kaven,18,Juja,Nurse
5,Sheila,22,Meru,Chef
6,Erick,28,Mwea,Doctor


# 1. Exploring DataFrames (Columns, Rows, Info, Describe)

Checking the First and Last Few Rows

In [747]:
print(df.head()) # First 5 rows
print(df.tail(2)) # Last 2 rows

       Name  Age     City        Occupation
0     Alice   25  Nairobi             Thief
1       Bob   30  Mombasa           Scammer
2   Charlie   35   Kisumu           Plumber
3  Kimanthi   23     Juja  Network Engineer
4     Kaven   18     Juja             Nurse
     Name  Age  City Occupation
5  Sheila   22  Meru       Chef
6   Erick   28  Mwea     Doctor


Getting Basics Information

In [748]:
#print(df.info()) # Overview of columns, data types and missing values
#print(df.shape) #Get the number of rows and columns
#print(df.columns) #List all column names
#print(df.index) #Show index range
print(df.columns)

Index(['Name', 'Age', 'City', 'Occupation'], dtype='object')


Summary Statistics

In [749]:
print(df.describe()) #Summary statistics for numerical columns
print(df.describe(include='all')) #Include non-numeric columns

             Age
count   7.000000
mean   25.857143
std     5.639993
min    18.000000
25%    22.500000
50%    25.000000
75%    29.000000
max    35.000000
         Name        Age  City Occupation
count       7   7.000000     7          7
unique      7        NaN     6          7
top     Alice        NaN  Juja      Thief
freq        1        NaN     2          1
mean      NaN  25.857143   NaN        NaN
std       NaN   5.639993   NaN        NaN
min       NaN  18.000000   NaN        NaN
25%       NaN  22.500000   NaN        NaN
50%       NaN  25.000000   NaN        NaN
75%       NaN  29.000000   NaN        NaN
max       NaN  35.000000   NaN        NaN


## 2. Data Selection & Filtering

>Selecting Columns

In [750]:
print(df['Name']) #Returns a single column as a Series

0       Alice
1         Bob
2     Charlie
3    Kimanthi
4       Kaven
5      Sheila
6       Erick
Name: Name, dtype: object


To get multiple Columns

In [751]:
print(df[['Name', 'City']]) #Returns a DataFrame with selected columns

       Name     City
0     Alice  Nairobi
1       Bob  Mombasa
2   Charlie   Kisumu
3  Kimanthi     Juja
4     Kaven     Juja
5    Sheila     Meru
6     Erick     Mwea


Selecting Rows Using loc[] and iloc[]


Using loc[] (Label-based selection)

In [752]:
print(df.loc[0]) #Get the first row by index label

Name            Alice
Age                25
City          Nairobi
Occupation      Thief
Name: 0, dtype: object


In [753]:
print(df.loc[0:1]) #Get first two rows(inclusive)

    Name  Age     City Occupation
0  Alice   25  Nairobi      Thief
1    Bob   30  Mombasa    Scammer


Using iloc[] (Position-based selection)

In [754]:
print(df.iloc[0]) #Get the first row by position


Name            Alice
Age                25
City          Nairobi
Occupation      Thief
Name: 0, dtype: object


In [755]:
print(df.iloc[0:2]) #Get first two rows(excluding 2)

    Name  Age     City Occupation
0  Alice   25  Nairobi      Thief
1    Bob   30  Mombasa    Scammer


Filtering Data(Condition Selection)


Filter rows where Age is greater than 25

In [756]:
print(df[df['Age'] > 25])

      Name  Age     City Occupation
1      Bob   30  Mombasa    Scammer
2  Charlie   35   Kisumu    Plumber
6    Erick   28     Mwea     Doctor


Filter rows where City is "Juja"

In [757]:
print(df[df['City'] == 'Juja'])

       Name  Age  City        Occupation
3  Kimanthi   23  Juja  Network Engineer
4     Kaven   18  Juja             Nurse


Filter with multiple conditions (& for AND, | for OR)

In [758]:
print(df[(df['Age'] > 25) & (df['City'] == 'Mombasa') | (df['City'] == 'Mwea')])

    Name  Age     City Occupation
1    Bob   30  Mombasa    Scammer
6  Erick   28     Mwea     Doctor


In [759]:
print(df[(df['Age'] < 30) | (df['City'] == 'Juja')])

       Name  Age     City        Occupation
0     Alice   25  Nairobi             Thief
3  Kimanthi   23     Juja  Network Engineer
4     Kaven   18     Juja             Nurse
5    Sheila   22     Meru              Chef
6     Erick   28     Mwea            Doctor


## 3. Modifying DataFrames

Adding a new column

In [760]:
# Create a new column called "Salary" and set values
df['Salary'] = [10000, 12000, 8000, 350000, 120000, 90000, 300000]
print(df)

       Name  Age     City        Occupation  Salary
0     Alice   25  Nairobi             Thief   10000
1       Bob   30  Mombasa           Scammer   12000
2   Charlie   35   Kisumu           Plumber    8000
3  Kimanthi   23     Juja  Network Engineer  350000
4     Kaven   18     Juja             Nurse  120000
5    Sheila   22     Meru              Chef   90000
6     Erick   28     Mwea            Doctor  300000


In [761]:
# Create a column based on existing data
df['Age in 10 years'] = df['Age'] + 10
print(df)

       Name  Age     City        Occupation  Salary  Age in 10 years
0     Alice   25  Nairobi             Thief   10000               35
1       Bob   30  Mombasa           Scammer   12000               40
2   Charlie   35   Kisumu           Plumber    8000               45
3  Kimanthi   23     Juja  Network Engineer  350000               33
4     Kaven   18     Juja             Nurse  120000               28
5    Sheila   22     Meru              Chef   90000               32
6     Erick   28     Mwea            Doctor  300000               38


Updating Values

In [762]:
#Modifying a single value
df.at[1, 'Age'] = 32 #update Age for Bob(index 1)
df.at[3, 'Salary'] = 400000
print(df)

       Name  Age     City        Occupation  Salary  Age in 10 years
0     Alice   25  Nairobi             Thief   10000               35
1       Bob   32  Mombasa           Scammer   12000               40
2   Charlie   35   Kisumu           Plumber    8000               45
3  Kimanthi   23     Juja  Network Engineer  400000               33
4     Kaven   18     Juja             Nurse  120000               28
5    Sheila   22     Meru              Chef   90000               32
6     Erick   28     Mwea            Doctor  300000               38


In [763]:
#Modify a column based on a condition
df.loc[df['City'] == 'Mwea', 'Salary'] = 350000
print(df)

       Name  Age     City        Occupation  Salary  Age in 10 years
0     Alice   25  Nairobi             Thief   10000               35
1       Bob   32  Mombasa           Scammer   12000               40
2   Charlie   35   Kisumu           Plumber    8000               45
3  Kimanthi   23     Juja  Network Engineer  400000               33
4     Kaven   18     Juja             Nurse  120000               28
5    Sheila   22     Meru              Chef   90000               32
6     Erick   28     Mwea            Doctor  350000               38


Deleting Columns and Rows

In [764]:
#Remove a column
df.drop(columns=['Age in 10 years'], inplace=True)
print(df)

       Name  Age     City        Occupation  Salary
0     Alice   25  Nairobi             Thief   10000
1       Bob   32  Mombasa           Scammer   12000
2   Charlie   35   Kisumu           Plumber    8000
3  Kimanthi   23     Juja  Network Engineer  400000
4     Kaven   18     Juja             Nurse  120000
5    Sheila   22     Meru              Chef   90000
6     Erick   28     Mwea            Doctor  350000


In [765]:
#Remove a row
df.drop(index=2, inplace=True) #Remove row with index 2 (Charlie)
print(df)

       Name  Age     City        Occupation  Salary
0     Alice   25  Nairobi             Thief   10000
1       Bob   32  Mombasa           Scammer   12000
3  Kimanthi   23     Juja  Network Engineer  400000
4     Kaven   18     Juja             Nurse  120000
5    Sheila   22     Meru              Chef   90000
6     Erick   28     Mwea            Doctor  350000


## 4. Sorting & Grouping in Pandas

Sorting Data

In [766]:
#Sort by single column
df_sorted = df.sort_values(by='Age') #Sort by age(Ascending by deafault)
print(df_sorted)

       Name  Age     City        Occupation  Salary
4     Kaven   18     Juja             Nurse  120000
5    Sheila   22     Meru              Chef   90000
3  Kimanthi   23     Juja  Network Engineer  400000
0     Alice   25  Nairobi             Thief   10000
6     Erick   28     Mwea            Doctor  350000
1       Bob   32  Mombasa           Scammer   12000


In [767]:
#Sort in descending order
df_sorted_desc = df.sort_values(by='Salary', ascending=False)
print(df_sorted_desc)

       Name  Age     City        Occupation  Salary
3  Kimanthi   23     Juja  Network Engineer  400000
6     Erick   28     Mwea            Doctor  350000
4     Kaven   18     Juja             Nurse  120000
5    Sheila   22     Meru              Chef   90000
1       Bob   32  Mombasa           Scammer   12000
0     Alice   25  Nairobi             Thief   10000


In [768]:
#Sort by multiple coumns
df_sorted_multi = df.sort_values(by=['City', 'Age'], ascending=[True, False])
print(df_sorted_multi)

       Name  Age     City        Occupation  Salary
3  Kimanthi   23     Juja  Network Engineer  400000
4     Kaven   18     Juja             Nurse  120000
5    Sheila   22     Meru              Chef   90000
1       Bob   32  Mombasa           Scammer   12000
6     Erick   28     Mwea            Doctor  350000
0     Alice   25  Nairobi             Thief   10000


Grouping & Aggregation

In [769]:
#Group by a column and count
df_grouped = df.groupby('City').size()
print(df_grouped)

City
Juja       2
Meru       1
Mombasa    1
Mwea       1
Nairobi    1
dtype: int64


In [770]:
#Group and calculate the average age per city
df_avg_age = df.groupby('City')['Age'].mean()
print(df_avg_age)

City
Juja       20.5
Meru       22.0
Mombasa    32.0
Mwea       28.0
Nairobi    25.0
Name: Age, dtype: float64


In [771]:
#Group and calculate multiple stats(mean, max, min)
df_summary = df.groupby('City').agg({'Age': ['mean', 'min', 'max']})
print(df_summary)

          Age        
         mean min max
City                 
Juja     20.5  18  23
Meru     22.0  22  22
Mombasa  32.0  32  32
Mwea     28.0  28  28
Nairobi  25.0  25  25


## 5. Handling Missing Data

Checking for missing Values

In [772]:
#To check if there are missing values in the DataFrame
print(df.isnull()) #Shows True for missing values, false otherwise
print(df.isnull().sum()) #Count missing values per column

    Name    Age   City  Occupation  Salary
0  False  False  False       False   False
1  False  False  False       False   False
3  False  False  False       False   False
4  False  False  False       False   False
5  False  False  False       False   False
6  False  False  False       False   False
Name          0
Age           0
City          0
Occupation    0
Salary        0
dtype: int64


Filling Missing Values

In [773]:
#Fill with a specific value
df.fillna("Unknown", inplace=True)

In [774]:
#Fill with the column mean (for numeric data)
df['Age'].fillna(df['Age'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


Fill with the previous value (ffill) or next value (bfill)

In [775]:
df.fillna(method='ffill', inplace=True)  # Forward fill
df.fillna(method='bfill', inplace=True)  # Backward fill


  df.fillna(method='ffill', inplace=True)  # Forward fill
  df.fillna(method='bfill', inplace=True)  # Backward fill
