# Pandas Notebook

In [None]:
# Use the %pip magic rather than !pip so the package is installed into the
# environment of the running kernel, not whichever pip is first on the shell PATH.
%pip install pandas

In [3]:
import pandas as pd
import numpy as np

# Notebook-wide display settings: floats rendered with 2 decimals,
# console output wrapped at 100 characters.
for option_name, option_value in (
    ("display.precision", 2),
    ("display.width", 100),
):
    pd.set_option(option_name, option_value)

## 1) Mental model: a DataFrame is a dict of Series, plus the axis convention

In [4]:
# Each dict key becomes a column label and each list becomes that column's values;
# the row labels default to 0, 1, 2.
df = pd.DataFrame(
    data={
        "Name": ["Ana", "Ben", "Cara"],
        "Age": [28, 35, 22],
        "City": ["NY", "Paris", "NY"],
    }
)
df

Unnamed: 0,Name,Age,City
0,Ana,28,NY
1,Ben,35,Paris
2,Cara,22,NY


In [5]:
# The column labels, in order, as a plain Python list.
list(df.columns)

['Name', 'Age', 'City']

In [6]:
# Dict-style lookup by column label returns that column as a Series
# (the output footer shows its name and dtype).
df["Name"]

0     Ana
1     Ben
2    Cara
Name: Name, dtype: object

In [7]:
# Reduce the Age column to a single scalar: the mean of its three values.
df["Age"].mean(axis=0)  # axis=0 => down rows (per column)

np.float64(28.333333333333332)

In [8]:
# Add a constant helper column so there are two numeric columns to average across.
# A scalar is broadcast to every row, so there is no hard-coded [1, 1, 1] list that
# must match the row count, and "Age" is not needlessly re-assigned.
# Note: "Is Dummy" stays in df and therefore appears in the later cells' outputs.
df["Is Dummy"] = 1
df[["Age", "Is Dummy"]].mean(axis=1)  # axis=1 => across columns (per row)

0    14.5
1    18.0
2    11.5
dtype: float64

## 2) Selection (loc), filtering, assignment

In [9]:
# .loc[row_label, column_label] with two scalar labels returns a single cell value.
df.loc[0, "Name"]

'Ana'

In [10]:
# Label slices in .loc include the end label: 0:1 selects rows 0 AND 1 (see the output).
# The list of column labels restricts the result to those columns, in that order.
df.loc[0:1, ["Name", "Age"]]

Unnamed: 0,Name,Age
0,Ana,28
1,Ben,35


In [11]:
# Build a boolean mask (one True/False per row), then keep only the True rows.
is_over_25 = df["Age"] > 25
df[is_over_25]

Unnamed: 0,Name,Age,City,Is Dummy
0,Ana,28,NY,1
1,Ben,35,Paris,1


In [12]:
# Two independent row masks combined element-wise with & (logical AND).
lives_in_ny = df["City"] == "NY"
is_under_30 = df["Age"] < 30
df[lives_in_ny & is_under_30]

Unnamed: 0,Name,Age,City,Is Dummy
0,Ana,28,NY,1
2,Cara,22,NY,1


In [13]:
# Assigning to a new column label adds that column to df in place.
# The comparison is evaluated per row and produces a boolean column.
df["Is_Adult"] = df["Age"] >= 18
df

Unnamed: 0,Name,Age,City,Is Dummy,Is_Adult
0,Ana,28,NY,1,True
1,Ben,35,Paris,1,True
2,Cara,22,NY,1,True


## 3) Inspection

In [14]:
# Preview the top of the frame; with only 3 rows, head() returns all of them.
df.head()

Unnamed: 0,Name,Age,City,Is Dummy,Is_Adult
0,Ana,28,NY,1,True
1,Ben,35,Paris,1,True
2,Cara,22,NY,1,True


In [15]:
# Prints a structural summary: index range, per-column non-null counts and dtypes,
# and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      3 non-null      object
 1   Age       3 non-null      int64 
 2   City      3 non-null      object
 3   Is Dummy  3 non-null      int64 
 4   Is_Adult  3 non-null      bool  
dtypes: bool(1), int64(2), object(2)
memory usage: 231.0+ bytes


In [16]:
# include="all" summarizes every column in one table: count/unique/top/freq for the
# non-numeric columns and mean/std/quantiles for the numeric ones. Statistics that
# do not apply to a column are left empty (NaN) in the output.
df.describe(include="all")

Unnamed: 0,Name,Age,City,Is Dummy,Is_Adult
count,3,3.0,3,3.0,3
unique,3,,2,,1
top,Ana,,NY,,True
freq,1,,2,,3
mean,,28.33,,1.0,
std,,6.51,,0.0,
min,,22.0,,1.0,
25%,,25.0,,1.0,
50%,,28.0,,1.0,
75%,,31.5,,1.0,


In [17]:
# (rows, columns) tuple followed by the column labels as a plain list.
df.shape, list(df.columns)

((3, 5), ['Name', 'Age', 'City', 'Is Dummy', 'Is_Adult'])

## 4) Cleaning

In [18]:
# Small frame with one missing value in column "A", used by the cleaning examples below.
dirty = pd.DataFrame(
    data={
        "A": [1.0, np.nan, 1.0],
        "B": ["x"] * 3,
    }
)
dirty

Unnamed: 0,A,B
0,1.0,x
1,,x
2,1.0,x


In [19]:
# Returns a new frame without the rows that contain a missing value (row 1 here).
# `dirty` itself is left unchanged, as the following cells show.
dirty.dropna()

Unnamed: 0,A,B
0,1.0,x
2,1.0,x


In [20]:
# Returns a new frame with missing values replaced by 0. Column "A" stays float,
# so the filled value is displayed as 0.0.
dirty.fillna(0)

Unnamed: 0,A,B
0,1.0,x
1,0.0,x
2,1.0,x


In [21]:
# Append a copy of the first row so the frame contains one more exact duplicate.
# ignore_index=True renumbers the combined rows 0..3.
first_row = dirty.iloc[[0]]
dupes = pd.concat([dirty, first_row], ignore_index=True)
dupes

Unnamed: 0,A,B
0,1.0,x
1,,x
2,1.0,x
3,1.0,x


In [22]:
# Keeps the first occurrence of each distinct row: rows 2 and 3 repeat row 0 and are
# dropped, while the row with the missing value is kept.
dupes.drop_duplicates()

Unnamed: 0,A,B
0,1.0,x
1,,x


## 5) GroupBy (split-apply-combine)

In [23]:
# Split: group the rows by City and pick each group's Age values.
ages_by_city = df.groupby("City")["Age"]
# Apply + combine: one mean per city, indexed by the city name.
ages_by_city.mean()

City
NY       25.0
Paris    35.0
Name: Age, dtype: float64

## 6) Concatenation and merge

In [24]:
# Two frames with identical columns, stacked vertically (axis=0).
# ignore_index=True discards the original row labels and renumbers the result 0..3.
df_top = pd.DataFrame(data={"A": [1, 2], "B": [5, 6]})
df_bottom = pd.DataFrame(data={"A": [3, 4], "B": [7, 8]})
pd.concat(objs=[df_top, df_bottom], axis=0, ignore_index=True)

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


In [25]:
# Two frames that share only the key "B".
left = pd.DataFrame(data={"key": ["A", "B"], "val1": [1, 2]})
right = pd.DataFrame(data={"key": ["B", "C"], "val2": [3, 4]})
# Inner join: keep only the keys present in both frames.
pd.merge(left=left, right=right, how="inner", on="key")

Unnamed: 0,key,val1,val2
0,B,2,3


In [26]:
# Left join: every row of `left` is kept. Key "A" has no match in `right`, so its val2
# is missing (NaN), which is why val2 is shown as a float (3.0) in the output.
pd.merge(left, right, on="key", how="left")

Unnamed: 0,key,val1,val2
0,A,1,
1,B,2,3.0


## 7) Map and apply

In [27]:
# Series.map accepts either a dict (value lookup) or a callable (applied per value).
# Both derived columns are added in one chained assign() call, in this order.
gender_df = pd.DataFrame(
    data={"Name": ["Ana", "Ben"], "Gender": ["Female", "Male"]}
).assign(
    Gender_Code=lambda frame: frame["Gender"].map({"Male": 0, "Female": 1}),
    Name_Length=lambda frame: frame["Name"].map(len),
)
gender_df

Unnamed: 0,Name,Gender,Gender_Code,Name_Length
0,Ana,Female,1,3
1,Ben,Male,0,3


In [28]:
def summarize(row):
    """Return a one-line description of a row, e.g. "Ana is female".

    `row` must be indexable by "Name" and "Gender"; the gender value is
    lower-cased before it is placed in the sentence.
    """
    name, gender = row["Name"], row["Gender"]
    return f"{name} is {gender.lower()}"

# axis=1 calls summarize once per row; the returned strings become the new
# "Summary" column, added to gender_df in place.
gender_df["Summary"] = gender_df.apply(summarize, axis=1)
gender_df

Unnamed: 0,Name,Gender,Gender_Code,Name_Length,Summary
0,Ana,Female,1,3,Ana is female
1,Ben,Male,0,3,Ben is male
