<a href="https://colab.research.google.com/github/zuhayerror3i8/AI-ML-Expert-With-Phitron-Batch-01/blob/main/000%20Python%20For%20ML/015_Module_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Module 12 — Data Analysis with Pandas

In [None]:
# <--- Introduction to Pandas DataFrames --->
# pandas is imported once here; the later cells rely on the `pd` alias.
import pandas as pd

# Load the CSV into a DataFrame. The path is relative, so 'student_data.csv'
# must be in the notebook's current working directory.
df = pd.read_csv('student_data.csv')
print(df)
print(type(df))  # Output: <class 'pandas.core.frame.DataFrame'>

In [None]:
# <--- Accessing a single column from a DataFrame --->
# Indexing a DataFrame with one column label returns that column as a Series.
student_id = df['StudentID']
print(student_id)
print(type(student_id))  # Output: <class 'pandas.core.series.Series'>

## Loading Different File Types in Pandas

In [None]:
# <--- Reading CSV file --->
# read_csv parses a comma-separated text file into a DataFrame.
csv_data = pd.read_csv('student_data.csv')

print(csv_data)
print(type(csv_data))  # Output: <class 'pandas.core.frame.DataFrame'>

In [None]:
# <--- Reading Excel file --->
# read_excel needs an Excel engine to be installed (openpyxl for .xlsx files).
# Despite its name, `excel_file` holds the parsed DataFrame, not a file handle.
excel_file = pd.read_excel('phitron_student_marks.xlsx')

print(excel_file)
print(type(excel_file))  # Output: <class 'pandas.core.frame.DataFrame'>

In [None]:
# <--- Reading Parquet file --->
# read_parquet needs a Parquet engine to be installed (pyarrow or fastparquet).
# Despite its name, `parquet_file` holds the parsed DataFrame, not a file handle.
parquet_file = pd.read_parquet('students.parquet')

print(parquet_file)
print(type(parquet_file))  # Output: <class 'pandas.core.frame.DataFrame'>

In [None]:
# <--- Reading JSON file --->
# read_json parses a JSON document into a DataFrame.
# Despite its name, `json_file` holds the parsed DataFrame, not a file handle.
json_file = pd.read_json('data.json')

print(json_file)
print(type(json_file))  # Output: <class 'pandas.core.frame.DataFrame'>

## Pandas Basic Functionalities

In [None]:
# <--- DataFrame Basic Operations --->
# Reload the CSV so the examples in this section start from a fresh DataFrame.
df = pd.read_csv('student_data.csv')
# A bare expression on the last line of a cell is rendered as the cell's output.
df

In [None]:
# Display the first rows of the DataFrame: head(n) returns the first n rows
# (4 here); head() with no argument returns 5.
df.head(4)

In [None]:
# Display the last rows of the DataFrame; with no argument tail() returns 5.
df.tail()

In [None]:
# <--- Getting column names --->
# numpy is imported here; the following cell also relies on the `np` alias.
import numpy as np

# df.columns holds the column labels. np.array() copies them into a plain
# NumPy array, which is printed together with its dtype (mirroring how the
# index values are printed in the next cell).
col = np.array(df.columns)
print(col)
print(col.dtype)  # Output: object

In [None]:
# <--- Getting index values --->
# Copy the row index labels into a NumPy array, then print the labels and
# the array's dtype.
ind = np.array(df.index)
print(ind)
print(ind.dtype)  # Output: int64

In [None]:
# Print a summary of the DataFrame: column names, non-null counts, dtypes
# and memory usage.
df.info()

In [None]:
# Get a random sample of 10 rows. The rows chosen differ from run to run
# unless random_state is passed; sampling more rows than the DataFrame holds
# raises ValueError because rows are drawn without replacement by default.
df.sample(10)

In [None]:
# Get a statistical summary (count, mean, std, min, quartiles, max) of the
# numerical columns.
df.describe()

## Creating DataFrames from Python Data Structures

In [None]:
# <--- Building a DataFrame from a list of rows --->
# Each inner list is one row; the column labels and a 1-based index are
# supplied explicitly.
my_list = [
    ['Alice', 25],
    ['Bob', 30],
    ['Charlie', 28],
]

list_df = pd.DataFrame(
    data=my_list,
    index=[1, 2, 3],
    columns=['Name', 'Age'],
)
print(type(list_df))  # Output: <class 'pandas.core.frame.DataFrame'>
list_df

In [None]:
# <--- Building a DataFrame from a tuple of tuples --->
# Each inner tuple is one row; with no index given, rows are numbered from 0.
my_tuple = (
    ('Alice', 25),
    ('Bob', 30),
    ('Charlie', 28),
)

column_names = ['Name', 'Age']
tuple_df = pd.DataFrame(data=my_tuple, columns=column_names)
tuple_df

In [None]:
# <--- Building a DataFrame from a dictionary of columns --->
# Each key becomes a column label and each list supplies that column's values.
names = ['Alice', 'Bob', 'Charlie']
ages = [25, 29, 28]
my_dict = {'Name': names, 'Age': ages}

dict_df = pd.DataFrame(data=my_dict)
dict_df

In [None]:
# <--- Building a DataFrame from a list of dictionaries --->
# Each dictionary is one row and its keys become column labels. The last
# record has no 'City' key, so that cell is filled with a missing value.
alice = {'Name': 'Alice', 'Age': 25, 'City': 'New York'}
bob = {'Name': 'Bob', 'Age': 30, 'City': 'Paris'}
charlie = {'Name': 'Charlie', 'Age': 28}
my_data = [alice, bob, charlie]

data_df = pd.DataFrame(data=my_data)
data_df

In [None]:
# Selecting a single column of data_df (the DataFrame created from the list
# of dictionaries) returns a Series, not a DataFrame.
print(type(data_df['Name']))  # Output: <class 'pandas.core.series.Series'>

## Accessing Values from DataFrames

In [None]:
# Display the DataFrame before the row/column access examples in this section.
df

In [None]:
# <--- Accessing a single column --->
# Selecting one column label returns a Series. Both the column and its type
# are printed explicitly, because a notebook cell only echoes its final
# expression and would otherwise discard the column itself.
full_name = df['FullName']
print(full_name)
print(type(full_name))  # Output: <class 'pandas.core.series.Series'>

In [None]:
# <--- Accessing a single row using loc --->
# Syntax: df.loc[row_label]
# A single row label returns that row as a Series. Both the row and its type
# are printed explicitly, because a notebook cell only echoes its final
# expression and would otherwise discard the row itself.
first_row = df.loc[0]
print(first_row)
print(type(first_row))  # Output: <class 'pandas.core.series.Series'>

In [None]:
# <--- Accessing multiple rows (using a list) --->
# Passing a list of index labels to loc returns those rows as a DataFrame,
# in the order listed.
df.loc[[2, 3, 19]]

In [None]:
# <--- Accessing multiple rows (using a label slice) --->
# Unlike ordinary Python slices, a loc slice includes both endpoints, so this
# selects the rows labelled 3 through 7.
df.loc[3:7]

In [None]:
# <--- Accessing a single column using loc --->
# Syntax: df.loc[:, 'column_name']
# The ':' in the row position selects every row; a single column label
# returns a Series.
df.loc[:, 'Python Marks']

In [None]:
# <--- Accessing multiple columns using loc --->
# A list of column labels returns a DataFrame, even though a single label
# would return a Series. Both the selection and its type are printed
# explicitly, because a notebook cell only echoes its final expression and
# would otherwise discard the selection itself.
marks = df.loc[:, ['Python Marks', 'Algorithm Marks']]
print(marks)
print(type(marks))  # Output: <class 'pandas.core.frame.DataFrame'>

In [None]:
# <--- Accessing specific rows and columns --->
# Syntax: df.loc[row_start:row_end, 'column_name']
# Both row_start and row_end are included in the result; a single column
# label returns a Series.
df.loc[3:7, 'CompletionStatus']

## Changing Index and Columns & Using iloc

In [None]:
# <--- Setting a column as index --->
# set_index returns a new DataFrame whose row labels are the 'StudentID'
# values. The result is assigned to df_index, so df itself is left unchanged.
df_index = df.set_index('StudentID')
df_index

In [None]:
# <--- Position-based selection with iloc --->
# iloc addresses rows and columns by 0-based integer position, not by label.
# Keep every row and the first five columns (positions 0 through 4).
df_index.iloc[:, :5]

In [None]:
# <--- Renaming columns --->
# Map each existing column label to its replacement.
column_renames = {
    'FullName': 'Full Name',
    'Algorithm Marks': 'Algo Marks',
}
# inplace=True changes df itself instead of returning a renamed copy.
df.rename(columns=column_renames, inplace=True)
df

## Modifying DataFrames and Iteration

In [None]:
# Display df before the modification examples in this section.
df

In [None]:
# <--- Deleting a row --->
# Remove the row whose index label is 0; inplace=True modifies df itself.
df.drop(index=0, inplace=True)
df

In [None]:
# <--- Deleting a column --->
# columns=... is the keyword form of drop(label, axis=1); axis=0 (the default)
# would look for the label among the rows instead.
df.drop(columns='Instructor', inplace=True)
df

In [None]:
# <--- Modifying specific cell values --->
# loc[row_label, column_label] on the left of an assignment overwrites a
# single cell. Both updates target the row labelled 1.
target_row = 1
df.loc[target_row, 'Python Marks'] = 90
df.loc[target_row, 'CompletionStatus'] = 'Completed'
df.head()

In [None]:
# <--- Modifying multiple rows at once --->
# Add 2 to 'Python Marks' for the rows labelled 1 through 3 (a loc slice
# includes both endpoints), then write the result back to the same cells.
boosted_marks = df.loc[1:3, 'Python Marks'] + 2
df.loc[1:3, 'Python Marks'] = boosted_marks
df.head(5)

In [None]:
# <--- Iterating through rows with iterrows --->
# Each iteration yields the row's index label and the row's data as a Series.
for label, row in df.iterrows():
    print(f"{label} : {row}")

In [None]:
# <--- Iterating through rows with itertuples --->
# Each row comes back as a named tuple; index=False leaves the index label
# out of the tuple.
# NOTE: column labels that are not valid Python identifiers (for example,
# labels containing spaces) are exposed under positional field names such
# as _1, _2, ...
for record in df.itertuples(index=False):
    print(record)

## Sorting DataFrames

In [None]:
# Display df before the sorting examples in this section.
df

In [None]:
# <--- Sorting by a single column (ascending) --->
# sort_values returns a new, sorted DataFrame; df keeps its own row order.
copy = df.sort_values(by='Data Structure Marks', ascending=True)
copy

In [None]:
# <--- Sorting by a single column (descending) --->
# ascending=False puts the highest 'Data Structure Marks' first.
copy = df.sort_values(by='Data Structure Marks', ascending=False)
copy

In [None]:
# <--- Sorting by multiple columns (all descending) --->
# Rows are ordered by the first key; the second key breaks ties. A single
# ascending=False applies to every key.
sort_keys = ['Data Structure Marks', 'Python Marks']
copy = df.sort_values(by=sort_keys, ascending=False)
copy

In [None]:
# <--- Sorting by multiple columns (mixed order) --->
# `ascending` takes one boolean per sort key, in the same order as the keys:
# 'Data Structure Marks' is sorted descending (False), and ties are broken
# by 'Python Marks' ascending (True).
copy = df.sort_values(
    ['Data Structure Marks', 'Python Marks'],
    ascending=[False, True],
)
copy

## Filtering Data Based on Conditions

In [None]:
# Display df before the filtering examples in this section.
df

In [None]:
# <--- Filtering rows based on a condition --->
# The comparison produces a boolean Series with one entry per row; loc keeps
# only the rows where it is True.
is_not_started = df['CompletionStatus'] == 'Not Started'
not_started = df.loc[is_not_started]
not_started

In [None]:
# <--- Filtering completed students --->
# Keep only the rows whose 'CompletionStatus' equals 'Completed'.
is_completed = df['CompletionStatus'] == 'Completed'
completed = df.loc[is_completed]
completed

In [None]:
# <--- Filtering with multiple conditions (AND) --->
# Completed students with Data Structure marks >= 90.
# Each condition is its own boolean Series; '&' combines them element-wise,
# so a row is kept only when both conditions hold.
is_completed = df['CompletionStatus'] == 'Completed'
has_high_ds_marks = df['Data Structure Marks'] >= 90
completed_ds90 = df.loc[is_completed & has_high_ds_marks]
completed_ds90