In [36]:
!pip install pandas



In [37]:
import pandas as pd
import numpy as np

**Creating a Pandas Series**

A Series is a one-dimensional labeled array capable of holding any data type.

*Creating a series from list*

In [38]:
data = [10,20,30,40,50]
series = pd.Series(data)
print(series)

0    10
1    20
2    30
3    40
4    50
dtype: int64


*Creating a series with custom index*

In [39]:
data = [100,200,300,400]
new_index = ['a','b','c','d']
series = pd.Series(data, index = new_index)
print(series)

a    100
b    200
c    300
d    400
dtype: int64


 *Accessing Elements in a Series*

In [40]:
#accessing by index
print(series,"\n")
print(series['b'],"\n")

#accessing multiple values
print(series[['a','b','d']],"\n")

a    100
b    200
c    300
d    400
dtype: int64 

200 

a    100
b    200
d    400
dtype: int64 



*performing operations on series*

In [41]:
#scalar operations
print(series * 2,"\n")

#applying functions
print(series.mean())
print(series.sum())
print(series.max())
print(series.min())
print(series.std())

a    200
b    400
c    600
d    800
dtype: int64 

250.0
1000
400
100
129.09944487358058


*Filtering & Conditional Selection in Pandas Series*

In [42]:
#Filtering elements(conditional selection)
print(series,"\n")
print(series[series > 200],"\n") # Select values greater than 200

#checking for missing values
print(series.isnull(),"\n") # Check which values are NaN
print(series.notnull(),"\n") # Check which values are NOT NaN

a    100
b    200
c    300
d    400
dtype: int64 

c    300
d    400
dtype: int64 

a    False
b    False
c    False
d    False
dtype: bool 

a    True
b    True
c    True
d    True
dtype: bool 



*Handling Missing Data in Series*

Sometimes, a Series can have NaN (Not a Number) values. We can handle them using:

dropna() → Removes missing values,

fillna(value) → Replaces missing values with a specified value

In [43]:
# Creating a series with missing values
data = [10,20,np.nan,40,np.nan,60]
series = pd.Series(data)

print("original series:\n",series,"\n")

# Dropping missing values
print("After dropping NaNs:\n",series.dropna(),"\n")

# Filling missing values with a specific number
print("After filling missing values:\n",series.fillna(0),"\n")

# Filling missing values with the mean of the series
print("After filling missing values with mean of series:\n",series.fillna(series.mean()),"\n")

original series:
 0    10.0
1    20.0
2     NaN
3    40.0
4     NaN
5    60.0
dtype: float64 

After dropping NaNs:
 0    10.0
1    20.0
3    40.0
5    60.0
dtype: float64 

After filling missing values:
 0    10.0
1    20.0
2     0.0
3    40.0
4     0.0
5    60.0
dtype: float64 

After filling missing values with mean of series:
 0    10.0
1    20.0
2    32.5
3    40.0
4    32.5
5    60.0
dtype: float64 



**Creating a DataFrame**

*We can create a DataFrame from different sources like:*

✅ Python dictionaries
✅ Lists of lists
✅ NumPy arrays
✅ Reading from CSV/Excel

1.Creating DataFrame from python dictionary

In [44]:
# Creating a dictionary
data = {
    "name":['Alice','Bob','Charlie'],
    "age":[25,30,35],
    "salary":[50000,60000,70000]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display DataFrame
print(df)

      name  age  salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000


2.Creating DataFrame from list of lists.

In [45]:
data_list = [["Alice",25,50000],["Bob",30,60000],["Charlie",40,70000]]
df_list = pd.DataFrame(data_list, columns=["Name","Age","Salary"])

print(df_list)

      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   40   70000


3.Creating DataFrame from a numpy array.

In [46]:
data_np = np.array([["Alice",25,50000],["Bob",30,60000],["Charlie",40,70000]])
df_np = pd.DataFrame(data_np, columns=["Name","Age","Salary"])

print(df_np)


      Name Age Salary
0    Alice  25  50000
1      Bob  30  60000
2  Charlie  40  70000


**Accessing Data in a DataFrame**

*Accessing Columns*

In [47]:
print(df["name"],"\n") #access the name column
print(df[["name","salary"]]) # access multiple columns


0      Alice
1        Bob
2    Charlie
Name: name, dtype: object 

      name  salary
0    Alice   50000
1      Bob   60000
2  Charlie   70000


**Accessing Rows by Indexes**

In [48]:
print(df.loc[1],"\n") # access the row with index 1
print(df.iloc[0]) #access the first row using position

name        Bob
age          30
salary    60000
Name: 1, dtype: object 

name      Alice
age          25
salary    50000
Name: 0, dtype: object


**Why use .iloc[] instead of .loc[]?**

*If your DataFrame does not have numerical indices (e.g., a custom index like names or dates), df.loc[0] may not work.*

*iloc always works because it strictly uses row positions, no matter what the index labels are.*

So, use loc when referring to labels and iloc when referring to positions.



**Slicing Rows**

In [49]:
print(df[0:2]) # Get first two rows

    name  age  salary
0  Alice   25   50000
1    Bob   30   60000


**Modifying Data in a DataFrame**

*Adding a New Column*

In [50]:
df["bonus"] = [5000,7000,8000] # Adding a new column
print(df)

      name  age  salary  bonus
0    Alice   25   50000   5000
1      Bob   30   60000   7000
2  Charlie   35   70000   8000


*Modifying an Existing Column*

In [51]:
df["salary"] = df["salary"] + 5000 # Increasing salary by 5000
print(df)

      name  age  salary  bonus
0    Alice   25   55000   5000
1      Bob   30   65000   7000
2  Charlie   35   75000   8000


*Updating a Specific Row*

In [52]:
df.loc[1,"age"] = 32 # Update Bob's age to 32
print(df)

      name  age  salary  bonus
0    Alice   25   55000   5000
1      Bob   32   65000   7000
2  Charlie   35   75000   8000


In [53]:
'''You can use .iloc[] instead of .loc[], but there's a key difference:

.loc[] is label-based (uses column names).

.iloc[] is position-based (uses index positions).

Using .iloc[] to Modify a Value:

Since age is the 2nd column (index 1 in zero-based indexing), you can update Bob’s age like this:

df.iloc[1, 1] = 32  # Update Bob's age to 32

print(df)

This means:

1 → Selects second row (Bob).

1 → Selects second column (age).

Both .loc[] and .iloc[] work, but .loc[] is preferred when working with column names for better readability. '''

"You can use .iloc[] instead of .loc[], but there's a key difference:\n\n.loc[] is label-based (uses column names).\n\n.iloc[] is position-based (uses index positions).\n\nUsing .iloc[] to Modify a Value:\n\nSince age is the 2nd column (index 1 in zero-based indexing), you can update Bob’s age like this:\n\ndf.iloc[1, 1] = 32  # Update Bob's age to 32\n\nprint(df)\n\nThis means:\n\n1 → Selects second row (Bob).\n\n1 → Selects second column (age).\n\nBoth .loc[] and .iloc[] work, but .loc[] is preferred when working with column names for better readability. "

*Deleting a Column*

In [54]:
df = df.drop(columns=["bonus"]) # Remove 'bonus' column
print(df)

      name  age  salary
0    Alice   25   55000
1      Bob   32   65000
2  Charlie   35   75000


**Filtering, Sorting, and Aggregations in DataFrames**

*Filtering Data*

In [55]:
# Select rows where salary is greater than 60,000
great_60 = df[df["salary"] > 60000]
print(great_60)

      name  age  salary
1      Bob   32   65000
2  Charlie   35   75000


*Sorting Data*

In [56]:
# Sort by age in ascending order
sorted_df = df.sort_values(by="age")
print(sorted_df)

# Sort by salary in descending order
sorted_df = df.sort_values(by= "salary", ascending=False)
print(sorted_df)

      name  age  salary
0    Alice   25   55000
1      Bob   32   65000
2  Charlie   35   75000
      name  age  salary
2  Charlie   35   75000
1      Bob   32   65000
0    Alice   25   55000


*Aggregation Functions*

In [57]:
# Get the average salary
print("Average salary:", df["salary"].mean())

# Get the highest salary
print("Highest salary:", df["salary"].max())

# Get the lowest salary
print("Lowest salary:", df["salary"].min())

Average salary: 65000.0
Highest salary: 75000
Lowest salary: 55000


**Handling Missing Data (dropna, fillna)**

Sometimes, data contains missing values (NaN). Pandas provides ways to handle them efficiently.

*Checking for Missing Values*

In [58]:
print(df.isnull()) #check which values are NaN
print(df.notnull()) #check which values are NOT NaN

    name    age  salary
0  False  False   False
1  False  False   False
2  False  False   False
   name   age  salary
0  True  True    True
1  True  True    True
2  True  True    True


*Creating a DataFrame with Missing Values*

In [59]:
data = {
    "name": ["Alice", "Bob", "Charlie", "David"],
    "age": [25, np.nan, 35, 40],
    "salary": [50000, 60000, np.nan, 80000]
}

df = pd.DataFrame(data)
print("Original dataframe:\n",df)

Original dataframe:
       name   age   salary
0    Alice  25.0  50000.0
1      Bob   NaN  60000.0
2  Charlie  35.0      NaN
3    David  40.0  80000.0


*Dropping Missing Values*

In [60]:
df_dropped = df.dropna() # Removes rows with NaN values
print("\nAfter dropping NaNs:\n",df_dropped)


After dropping NaNs:
     name   age   salary
0  Alice  25.0  50000.0
3  David  40.0  80000.0


*Filling Missing Values*

In [61]:
df_filled = df.fillna(0) # Replace NaNs with 0
print("\nAfter filling NaNs with 0:\n",df_filled)

df_filled_mean = df.fillna(df['salary'].mean()) # Replace NaNs with salary column mean
print("\nAfter Filling NaNs with Column Mean:\n", df_filled_mean)


After filling NaNs with 0:
       name   age   salary
0    Alice  25.0  50000.0
1      Bob   0.0  60000.0
2  Charlie  35.0      0.0
3    David  40.0  80000.0

After Filling NaNs with Column Mean:
       name           age        salary
0    Alice     25.000000  50000.000000
1      Bob  63333.333333  60000.000000
2  Charlie     35.000000  63333.333333
3    David     40.000000  80000.000000


**Importing & Exporting Data**

Pandas makes it easy to read and write data in different formats like CSV and Excel.

*Importing Data from CSV*

In [62]:
df = pd.read_csv("data.csv") #load csv file into a dataframe
print(df.head()) # display first 5 rows

   name   age      role        salary  Unnamed: 4
0  sush  21.0       ceo  50,00,00,000         NaN
1   bob  25.0   manager     50,00,000         NaN
2   sam  40.0  director         45555         NaN
3   roy  34.0  employee         34243         NaN
4  rita  23.0   support        454232         NaN


*Exporting Data to CSV*

In [63]:
df.to_csv("output.csv", index=False) # Save DataFrame as CSV without index

*Importing Data from Excel*

In [64]:
!pip install openpyxl
df = pd.read_excel("data.xlsx", sheet_name="Sheet1")
print(df.head())
!python.exe -m pip install --upgrade pip

Empty DataFrame
Columns: []
Index: []


*Exporting Data to Excel*

In [65]:
df.to_excel("output.xlsx", index=False, sheet_name="Sheet1")