In [1]:
import pandas as pd

In [2]:
# Build a pandas Series with an explicit index (date strings as labels).
# When no index is supplied, pandas assigns default integer labels starting at 0.

traffic_index = ['2025-11-20', '2025-11-21', '2025-11-23']
traffic_data = [500, 600, 800]
df_traffic = pd.Series(data=traffic_data, index=traffic_index)
df_traffic.head()

2025-11-20    500
2025-11-21    600
2025-11-23    800
dtype: int64

In [3]:
# Same values without an explicit index: the labels default to 0, 1, 2, ...
df_traffic_default = pd.Series(data=traffic_data)
df_traffic_default.head()

0    500
1    600
2    800
dtype: int64

In [None]:
# Arithmetic between two Series aligns on index labels, not on position.

traffic_index = ['2025-11-20', '2025-11-21', '2025-11-23']
traffic_data = [500, 600, 800]
df_traffic = pd.Series(data=traffic_data, index=traffic_index)

ad_spend_index = ['2025-11-20', '2025-11-22', '2025-11-23']
ad_spend_data = [100, 120, 150]
df_ad_spend = pd.Series(data=ad_spend_data, index=ad_spend_index)

# The result's index is the union of both indexes. A label that exists in only
# one Series has no partner value, so its result is NaN (and the dtype is float64).
new_df = df_traffic.add(df_ad_spend)
new_df.head()

2025-11-20    600.0
2025-11-21      NaN
2025-11-22      NaN
2025-11-23    950.0
dtype: float64

In [6]:
# Referencing values by labelled index vs. positional index.
# .loc selects by index label — here the date string used as the label.
new_df.loc["2025-11-23"]

np.float64(950.0)

In [7]:
# .iloc selects by integer position: [0] is the first element of the Series.
df_traffic_default.iloc[0]

np.int64(500)

In [16]:
# Setting and resetting the index: sample inventory table, one row per product.
df_inventory = pd.DataFrame(
    data=[
        ('SKU001', 'Keyboard', 150),
        ('SKU002', 'Monitor', 80),
        ('SKU003', 'Mouse', 220),
    ],
    columns=['SKU', 'Name', 'Stock'],
)


In [15]:
# Preview the inventory table; at this point it still has the default integer index.
df_inventory.head()

Unnamed: 0,SKU,Name,Stock
0,SKU001,Keyboard,150
1,SKU002,Monitor,80
2,SKU003,Mouse,220


In [18]:
# Setting the index: promote the "Name" column to row labels.
# The result is stored under a new name, so df_inventory keeps its original layout.
df_inventory_set = df_inventory.set_index(keys="Name")
df_inventory_set.head()

Unnamed: 0_level_0,SKU,Stock
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Keyboard,SKU001,150
Monitor,SKU002,80
Mouse,SKU003,220


In [20]:
# Resetting the index: move the current index back into a regular column and
# restore the default integer index.
# `names` sets the label of the restored column; the index here was created via
# set_index("Name"), so the column is again called "Name".
# NOTE(review): per the pandas docs the `names` argument was added in pandas 1.5 —
# confirm the target environment is at least that version.
df_inventory_set_reset = df_inventory_set.reset_index(names=["Name"])
df_inventory_set_reset.head()

Unnamed: 0,Name,SKU,Stock
0,Keyboard,SKU001,150
1,Monitor,SKU002,80
2,Mouse,SKU003,220


In [21]:
# MultiIndex (hierarchical index): start from a flat table of sales per region and product.
data = dict(
    Region=['East', 'East', 'West', 'West', 'East', 'West'],
    Product=['A', 'B', 'A', 'B', 'A', 'A'],
    Sales=[100, 150, 200, 120, 110, 220],
)

df_data = pd.DataFrame(data=data)
df_data.head()

Unnamed: 0,Region,Product,Sales
0,East,A,100
1,East,B,150
2,West,A,200
3,West,B,120
4,East,A,110


In [23]:
# Passing a list of columns to set_index builds a two-level index:
# "Region" is the outer level and "Product" the inner level.
data_multi = df_data.set_index(keys=["Region", "Product"])
data_multi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales
Region,Product,Unnamed: 2_level_1
East,A,100
East,B,150
West,A,200
West,B,120
East,A,110


In [24]:
# DatetimeIndex: temperature readings keyed by their timestamp.

data = {
    'Timestamp': [
        '2025-01-15 08:00:00',
        '2025-01-15 09:00:00',
        '2025-01-16 08:00:00',
        '2025-01-16 09:00:00',
    ],
    'Temperature_C': [20.5, 21.1, 22.0, 21.5],
}

# Parse the timestamp strings into datetime values, then use that column as the index.
df = pd.DataFrame(data).assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'])
)
df_time_indexed = df.set_index('Timestamp')

print(df_time_indexed)

                     Temperature_C
Timestamp                         
2025-01-15 08:00:00           20.5
2025-01-15 09:00:00           21.1
2025-01-16 08:00:00           22.0
2025-01-16 09:00:00           21.5


In [25]:
# Time-based lookup on the datetime index: a full timestamp string used as the
# label selects the matching row, which is printed as a Series named after that timestamp.
print(df_time_indexed.loc["2025-01-15 08:00:00"])

Temperature_C    20.5
Name: 2025-01-15 08:00:00, dtype: float64


In [26]:
# Range slicing with timestamp strings as the bounds. As the printed result shows,
# the rows at both the start and the end bound are included.
print(df_time_indexed.loc["2025-01-15 08:00:00":"2025-01-16 08:00:00"])

                     Temperature_C
Timestamp                         
2025-01-15 08:00:00           20.5
2025-01-15 09:00:00           21.1
2025-01-16 08:00:00           22.0


In [28]:
# Dealing with bad data: numeric values that arrive as strings.
# The raw values mix parseable numbers with placeholders ('N/A', 'Error'),
# so the Series starts out with a non-numeric dtype.

sales_series = pd.Series(data=['45.5', '20', 'N/A', '10.2', 'Error', '5'])
print(sales_series.dtype)

object


In [None]:
# Options for the `errors` parameter of pd.to_numeric:
#   errors="raise"  -> raise an exception when a value cannot be parsed
#   errors="coerce" -> replace values that cannot be parsed ('N/A', 'Error') with NaN
#   errors="ignore" -> NOTE(review): the pandas docs describe this as returning the
#                      input unchanged (not a partial conversion) and mark it as
#                      deprecated since pandas 2.2 — confirm for the installed version
# With "coerce" the printed dtype is float64.
sales_series = pd.to_numeric(sales_series, errors="coerce")
print(sales_series.dtype)

float64


In [31]:
# Date strings written in two layouts (dash- and slash-separated), plus one
# malformed entry, '2025/35/01', that does not describe a real calendar date.
date_series = pd.Series(
    data=['2025-11-20', '2025/11/21', '2025/35/01', '2025-11-23']
)
print("\nOriginal Date Series Dtype:", date_series.dtype)


Original Date Series Dtype: object


In [32]:
# Parse the strings into datetimes.
# - format="mixed": the Series mixes 'YYYY-MM-DD' and 'YYYY/MM/DD'. Without it,
#   pandas 2.x infers a single format from the first element, and with
#   errors="coerce" the valid '2025/11/21' would silently become NaT as well.
# - errors="coerce": entries that cannot be parsed at all ('2025/35/01') become NaT.
date_series = pd.to_datetime(date_series, format="mixed", errors="coerce")
# The label says "Converted" because this prints the dtype after parsing.
print("\nConverted Date Series Dtype:", date_series.dtype)


Original Date Series Dtype: datetime64[ns]


In [None]:
# Converting individual columns of a DataFrame one by one.

# Small sample log so that this cell runs on a fresh kernel (df_logs is not
# created by any earlier cell). Replace it with your own data.
df_logs = pd.DataFrame({
    'User_ID': [101, 102, 103],
    'Event_Time': ['2025-11-20 10:00:00', 'not a date', '2025-11-21 12:30:00'],
    'Revenue': ['19.99', 'N/A', '5'],
    'Units_Sold': ['2', 'Error', '1'],
})

# 1. Convert 'Revenue' to numeric, coercing bad values to NaN
df_logs['Revenue'] = pd.to_numeric(df_logs['Revenue'], errors='coerce')

# 2. Convert 'Event_Time' to datetime, coercing bad values to NaT
df_logs['Event_Time'] = pd.to_datetime(df_logs['Event_Time'], errors='coerce')

# 3. Convert 'Units_Sold' to integer. A plain int column cannot hold NaN, so the
# NaNs produced by coercion are handled first: fillna(0) for simplicity
# (use dropna() instead when a fabricated 0 would be misleading).
df_logs['Units_Sold'] = (
    pd.to_numeric(df_logs['Units_Sold'], errors='coerce')
    .fillna(0)
    .astype(int)
)

In [None]:
# Converting many columns at once.

# Sample frame so that this cell runs on a fresh kernel: the `df` left over from
# the datetime example has none of the columns used below. Replace it with your
# own DataFrame.
df = pd.DataFrame({
    'sensor_reading_1': ['1.5', '2.0', 'bad'],
    'sensor_reading_2': ['10', 'N/A', '30'],
    'price_usd': ['9.99', '19.99', '4.50'],
    'quantity': ['1', '2', 'three'],
    'user_city': ['Pune', 'Delhi', 'Pune'],
    'product_type': ['book', 'toy', 'book'],
    'platform_os': ['ios', 'android', 'ios'],
})

# Columns that need to be converted to numeric (e.g., from an 'object' dtype)
numeric_columns = ['sensor_reading_1', 'sensor_reading_2', 'price_usd', 'quantity']

# Apply pd.to_numeric to every selected column; values that cannot be parsed become NaN
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Convert low-cardinality text columns to 'category' (for memory optimization)
category_cols = ['user_city', 'product_type', 'platform_os']
df[category_cols] = df[category_cols].astype('category')

In [None]:
# Converting a column to strings.

# Option A — astype(str): plain Python strings. Kept on a separate Series because
# it also stringifies missing values (in pandas 2.x NaN becomes the text 'nan'),
# which would defeat Option B if it were applied to the column first.
user_id_as_str = df_logs['User_ID'].astype(str)

# Option B — the pandas 'string' dtype, which keeps missing values as <NA>.
# This is the conversion written back to the frame.
df_logs['User_ID'] = df_logs['User_ID'].astype('string')