# Data Analysis with Pandas

### Import and use Pandas

In [1]:
import pandas as pd

# create new series
number_series = pd.Series([1,2,3,4])
print(number_series)

# create new Data Frame
user_data_dict = {
    "Name":["Ram","Hari"],
    "Age": [20,23],
    "Address":["Kathmandu","Lalitpur"]
}
user_data = pd.DataFrame(user_data_dict)
user_data

0    1
1    2
2    3
3    4
dtype: int64


Unnamed: 0,Name,Age,Address
0,Ram,20,Kathmandu
1,Hari,23,Lalitpur


## Read data  from CSV, Excel and JSON file

### Read data from csv

In [2]:
import pandas as pd

student_csv_data = pd.read_csv('data/student.csv')
student_csv_data.head()

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
3,4,Krish Star,Four,60,female
4,5,John Mike,Four,60,female


### Save Pandas Dataframe to csv, excel and JSON file

In [3]:
import pandas as pd

# create new Data Frame
user_data = pd.DataFrame({
    "Name":["Ram","Hari"],
    "Age": [20,23],
    "Address":["Kathmandu","Lalitpur"]
})

user_data.to_csv('./data/user_data.csv')
user_data.to_excel('data/user_data.xlsx')

## Basic Data Exploration

### Data Inspection

In [4]:
import pandas as pd
students = pd.read_csv('data/student.csv')
print(students.shape)
print(students.head()) # print top n=5 rows
# print(students.tail()) # print bottom n=5 rows
print(students.info())
students.describe()

(35, 5)
   id        name  class  mark  gender
0   1    John Deo   Four    75  female
1   2    Max Ruin  Three    85    male
2   3      Arnold  Three    55    male
3   4  Krish Star   Four    60  female
4   5   John Mike   Four    60  female
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      35 non-null     int64 
 1   name    35 non-null     object
 2   class   35 non-null     object
 3   mark    35 non-null     int64 
 4   gender  35 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.5+ KB
None


Unnamed: 0,id,mark
count,35.0,35.0
mean,18.0,74.657143
std,10.246951,16.401117
min,1.0,18.0
25%,9.5,62.5
50%,18.0,79.0
75%,26.5,88.0
max,35.0,96.0


# Data Selection and Indexing in Pandas
- ##### Basic Indexing
- ##### Label-based Indexing (.loc[])
- ##### Integer-Label-based Indexing (.iloc[])
- ##### Single Value Access (.at[], .iat[])

### Basic Indexing

In [5]:
#  
# # Basic indexing
print(students['name'])          # Single column as a Series
new_std_data = students[['id','name','mark']]
print(new_std_data)   # Multiple columns as a DataFrame

# # # Slicing rows
first_10_rows = students[:10]
print(first_10_rows) # only first row (dataframe)
last_five_rows = students[30:].copy()
print(last_five_rows)          # Rows from index 30 to last index (dataframe)


0        John Deo
1        Max Ruin
2          Arnold
3      Krish Star
4       John Mike
5       Alex John
6     My John Rob
7          Asruid
8         Tes Qry
9        Big John
10         Ronald
11          Recky
12            Kty
13           Bigy
14       Tade Row
15          Gimmy
16          Tumyu
17          Honny
18          Tinny
19         Jackly
20     Babby John
21         Reggid
22          Herod
23      Tiddy Now
24       Giff Tow
25         Crelea
26       Big Nose
27      Rojj Base
28    Tess Played
29      Reppy Red
30    Marry Toeey
31      Binn Rott
32      Kenn Rein
33       Gain Toe
34     Rows Noump
Name: name, dtype: object
    id         name  mark
0    1     John Deo    75
1    2     Max Ruin    85
2    3       Arnold    55
3    4   Krish Star    60
4    5    John Mike    60
5    6    Alex John    55
6    7  My John Rob    78
7    8       Asruid    85
8    9      Tes Qry    78
9   10     Big John    55
10  11       Ronald    89
11  12        Recky    94
12  13

### Label-based Indexing (.loc[])
- .loc[] is used for selecting rows and columns by labels
- We can use single labels, lists of labels, or slices of labels

In [6]:
# Selecting rows based on label
students = pd.read_csv('data/student.csv')
first_row = students.loc[1]
first_two_rows = students.loc[0:1]
print(first_row)                    # Row with label 1 (index) labels can be string, date etc
print(first_two_rows)                  # Rows with labels 0 to 1

# # Selecting specific rows to columns
first_five_stds_with_name_to_marks = students.loc[0:4,'name':'mark']
print(first_five_stds_with_name_to_marks)         # Rov ws 0 to 2, columns name to mark

# select only particular rows and columns
first_and_second_row_with_name_and_gender = students.loc[[5, 20], ['name', 'gender']]
# Rows 6th and 21th, columns name and gender
first_and_second_row_with_name_and_gender


id               2
name      Max Ruin
class        Three
mark            85
gender        male
Name: 1, dtype: object
   id      name  class  mark  gender
0   1  John Deo   Four    75  female
1   2  Max Ruin  Three    85    male
         name  class  mark
0    John Deo   Four    75
1    Max Ruin  Three    85
2      Arnold  Three    55
3  Krish Star   Four    60
4   John Mike   Four    60


Unnamed: 0,name,gender
5,Alex John,male
20,Babby John,female


<!--  -->

### Integer-location Based Indexing
- .iloc[] is **used** for selecting rows and columns by integer index positions
- Similar to .loc[], we can use single integers, lists of integers, or slices

In [7]:
# Selecting rows based on index positions
print(students.iloc[1])                   # Second row (index 1)
print(students.iloc[0:3])                 # Rows at index positions 0 to 2

# Selecting specific rows and columns
print(students.iloc[0:2, 0:2])            # Rows 0 to 1, columns 0 to 1
print(students.iloc[[0, 2], [0, 2]])      # Rows 0 and 1, columns 0 and 1


id               2
name      Max Ruin
class        Three
mark            85
gender        male
Name: 1, dtype: object
   id      name  class  mark  gender
0   1  John Deo   Four    75  female
1   2  Max Ruin  Three    85    male
2   3    Arnold  Three    55    male
   id      name
0   1  John Deo
1   2  Max Ruin
   id  class
0   1   Four
2   3  Three


### Single Value Access (.at[] and .iat[])
- .at[] is used for accessing a single scalar value by label.
- .iat[] is for accessing by index position.

In [8]:
# Using .at[] to access single value by label
print(students.at[1, 'name'])                # Value in row 2, column 'name'

# Using .iat[] to access single value by position
print(students.iat[5, 1])                 # Value in row 6, column 1


Max Ruin
Alex John


# Filtering
- Logical operators (>, <, ==,  !=)
- Multiple logical operators (& (AND) and | (OR))
- Str ancessor, Isin, contains, between, startWith
- Tilde (~)
- Query
- Nlargest and nsmallest
- Loc and iloc

### Logical Operators

In [9]:
# students with marks less than 40
stds_marks_less_than_40 = students[students.mark<40]
print(stds_marks_less_than_40.head())

boys = students[students.gender=='male']

boys.head()


    id   name class  mark gender
18  19  Tinny  Nine    18   male


Unnamed: 0,id,name,class,mark,gender
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
5,6,Alex John,Four,55,male
6,7,My John Rob,Fifth,78,male
7,8,Asruid,Five,85,male


### Multiple Logical Opertors

In [10]:
#  students  of class 'four' and have marks above 80
class_four_high_marks = students[ (students['class'] == 'Four')  & (students['mark'] > 80) ]
class_four_high_marks.head()

# students of class 'four' and 'Five' and have marks above 70
stds_of_fout_five = students[( ( students['class'] =='Four' ) | (students['class']=='Five') )  & (students['mark']>70)]
stds_of_fout_five

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
7,8,Asruid,Five,85,male
14,15,Tade Row,Four,88,male
15,16,Gimmy,Four,88,male
17,18,Honny,Five,75,male
30,31,Marry Toeey,Four,88,male


### str,Isin, contains, between, startWith

In [11]:
# students whose name starts with 'J'
stds_with_name_j = students[students.name.str.startswith('J')]
stds_with_name_j.head()

stds_with_name_gimmy = students[students.name.str.contains('Gimmy')]
print(stds_with_name_gimmy)
# students with marks between 40 and 50
stds_with_name_j = students[students.mark.between(40,50)]
stds_with_name_j.head()

# students of class Nine or Ten
class_nine_or_ten = students[students['class'].isin(['Nine', 'Ten'])]
class_nine_or_ten

    id   name class  mark gender
15  16  Gimmy  Four    88   male


Unnamed: 0,id,name,class,mark,gender
18,19,Tinny,Nine,18,male
19,20,Jackly,Nine,65,female


### Tilde(~)

In [12]:
# Filter students who are NOT in class 'Nine'
not_class_nine = students[~students['class'].isin(['Nine'])]
not_class_nine.head()

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
3,4,Krish Star,Four,60,female
4,5,John Mike,Four,60,female


### Query

In [13]:
# Filter students with marks above 85 and gender is 'male'
query_filter = students.query("mark > 85 and gender == 'male'")
query_filter.head()

Unnamed: 0,id,name,class,mark,gender
14,15,Tade Row,Four,88,male
15,16,Gimmy,Four,88,male
24,25,Giff Tow,Seven,88,male
30,31,Marry Toeey,Four,88,male


### nlargest and nsmallest
Use nlargest and nsmallest to get rows with the top or bottom values in a column

In [14]:
# Get top 3 students with the highest marks
top_3_marks = students.nlargest(3, 'mark')
print(top_3_marks)
# Get 2 students with the lowest marks
bottom_2_marks = students.nsmallest(2, 'mark')
bottom_2_marks

    id       name  class  mark  gender
32  33  Kenn Rein    Six    96  female
11  12      Recky    Six    94  female
31  32  Binn Rott  Seven    90  female


Unnamed: 0,id,name,class,mark,gender
18,19,Tinny,Nine,18,male
16,17,Tumyu,Six,54,male


### loc and iloc

In [15]:
# Using `loc` to select rows with IDs greater than 2
loc_filter = students.loc[students['id']> 2]
print(loc_filter.head())
# # Using `iloc` to select the first 3 rows
# iloc_filter = students.iloc[:3]


   id         name  class  mark  gender
2   3       Arnold  Three    55    male
3   4   Krish Star   Four    60  female
4   5    John Mike   Four    60  female
5   6    Alex John   Four    55    male
6   7  My John Rob  Fifth    78    male


# Data Transformation and Mapping
- **apply**: allows us to apply a function to each element, row, or column in a DataFrame or Series
- **map**: is  used to map values in a Series according to a dictionary or another Series
- **replace**: replace allows for replacing specific values in the DataFrame with new value
- **astype**: this is used to convert data type
- **pipe**: it allows chaining and using complex functions that operate on the entire DataFrame. This can be used for complex transformation operation

### apply

In [1]:
# convert marks to scale of 1 
import pandas as pd
students = pd.read_csv('data/student.csv')
students['mark_percent'] = students['mark'].apply(lambda m: m /100)
students.head()

Unnamed: 0,id,name,class,mark,gender,mark_percent
0,1,John Deo,Four,75.0,female,0.75
1,2,Max Ruin,Three,85.0,male,0.85
2,3,Arnold,Three,55.0,male,0.55
3,4,Krish Star,Four,60.0,female,0.6
4,5,John Mike,Four,60.0,female,0.6


### map


In [5]:
import pandas as pd
students = pd.read_csv('./data/student.csv')
students['grade'] = students['class'].map({'One':1,'Two':2,'Three':3,'Four':4,
                                           'Five':5,'Six':6,'Seven':7,
                                           'Eight':8,'Nine':9,'Ten':10}) 

students['gender_code'] = students['gender'].map({'male':1,'female':2})
students.head()

Unnamed: 0,id,name,class,mark,gender,grade,gender_code
0,1,John Deo,Four,,female,4.0,2.0
1,2,Max Ruin,Three,85.0,male,3.0,1.0
2,3,Arnold,Three,55.0,male,3.0,1.0
3,4,Krish Star,Four,60.0,,4.0,
4,5,John Mike,Four,60.0,female,4.0,2.0


### replace

In [6]:

# Replace specific values in the 'class' column
students['class'] = students['class'].replace({'Four': '4th', 'Ten': '10th', 'Eight': '8th'})
# replace name 'John Doe' with 'JaiRam' (replace specific value)
students['name'] = students['name'].replace({'John Deo':'JaiRam'})
students.head()

Unnamed: 0,id,name,class,mark,gender,grade,gender_code
0,1,JaiRam,4th,,female,4.0,2.0
1,2,Max Ruin,Three,85.0,male,3.0,1.0
2,3,Arnold,Three,55.0,male,3.0,1.0
3,4,Krish Star,4th,60.0,,4.0,
4,5,John Mike,4th,60.0,female,4.0,2.0


### astype

In [9]:
# Convert 'grade' column to int
students['mark'] = students['mark'].astype(float)
students.head()

Unnamed: 0,id,name,class,mark,gender,grade,gender_code
0,1,JaiRam,4th,,female,4.0,2.0
1,2,Max Ruin,Three,85.0,male,3.0,1.0
2,3,Arnold,Three,55.0,male,3.0,1.0
3,4,Krish Star,4th,60.0,,4.0,
4,5,John Mike,4th,60.0,female,4.0,2.0


### pipe

In [None]:
 
def check_result(df):
    df['result'] = df['mark'].apply(lambda m : 'Pass' if m>40 else 'Fail')  
    return df

students = students.pipe(check_result)
students.tail() # last 5 items

Unnamed: 0,id,name,class,mark,gender,grade,gender_code,result
30,31,Marry Toeey,4th,,male,4.0,1.0,Fail
31,32,Binn Rott,Seven,90.0,female,7.0,2.0,Pass
32,33,Kenn Rein,Six,96.0,female,6.0,2.0,Pass
33,34,Gain Toe,Seven,69.0,male,7.0,1.0,Pass
34,35,Rows Noump,Six,88.0,female,6.0,2.0,Pass


# Data Cleaning and Manipulation
- Handle Missing Data
- Handle Duplicates
- One-Hot-Encoding
- Normalization 

![Benefits of Data Cleaning](assets/data-cleaning.png)


### Handle Missing Data
- Remove Rows Containing Missing Values ``(df.dropna())``
    - ``dropna(axis=0)`` removes rows containing at least one missing value
    - ``(dropna(axis=1))`` Removes columns containing at least one missing value 
- Replace Missing Values ``(df.fillna())``
    - **Replace with specific value**: ``(df.fillna(value))`` 
    - **Backward Fill, Forward Fill**: ``(df.fillna(value,method='bfill')) (df.fillna(value,method='ffill'))`` 
    - **Fill individual column with specified value**: ``(df.fillna({'height':150,'weight:60})`` 
    - **Fill with average, Interpolation etc.**: 


In [None]:
from IPython.display import display

students = pd.read_csv('data/student.csv')
display(students.head())
students.dropna(axis=0,how='any',inplace=True) # (axis=0 default) drop rows with NaN  

students =students.fillna(1)  # replace missing values with 1
display(students.head())

students.fillna(method='ffill') # fill with previous row value
students.fillna(method='bfill') # fill with next row
students['mark'].fillna(50) # ony fill empty value of Name column
students.fillna({'mark':50,'gender':'Male'}) # replace mark with 50 if its empty and gender with Male
df_interpolated = students.interpolate()
students.fillna(df_interpolated)
students['mark'].fillna (students['mark'].mean())

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,,female
1,2,Max Ruin,Three,85.0,male
2,3,Arnold,Three,55.0,male
3,4,Krish Star,Four,60.0,
4,5,John Mike,Four,60.0,


### Handle Duplicate Data
- Check Duplicates
- Drop Duplicates

In [None]:
duplicates = students.duplicated() # check duplicate
students.drop_duplicates() # drop duplicate
duplicates

0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool

### One-Hot-Encoding
- One Hot Encoding is a method for converting categorical variables into a binary format. It creates new binary columns (0s and 1s) for each category in the original variable. Each category in the original column is represented as a separate column, where a value of 1 indicates the presence of that category, and 0 indicates its absence
For example we replace the country name with numerical value
e.g. America = 1,
India=2
Nepal=3

In [None]:

students = pd.read_csv('data/student.csv')
df_pandas_encoded = pd.get_dummies(students, columns=['class'])
display(df_pandas_encoded)



One-Hot Encoded Data using Pandas:



Unnamed: 0,id,name,mark,gender,class_Eight,class_Fifth,class_Five,class_Four,class_Nine,class_Seven,class_Six,class_Three
0,1,John Deo,75.0,female,False,False,False,True,False,False,False,False
1,2,Max Ruin,85.0,male,False,False,False,False,False,False,False,True
2,3,Arnold,55.0,male,False,False,False,False,False,False,False,True
3,4,Krish Star,60.0,female,False,False,False,True,False,False,False,False
4,5,John Mike,60.0,female,False,False,False,True,False,False,False,False
5,6,Alex John,55.0,male,False,False,False,True,False,False,False,False
6,7,My John Rob,78.0,male,False,True,False,False,False,False,False,False
7,8,Asruid,85.0,male,False,False,True,False,False,False,False,False
8,9,Tes Qry,78.0,male,False,False,False,False,False,False,True,False
9,10,Big John,55.0,,False,False,False,True,False,False,False,False


### Data Normalization 
Data normalization involves adjusting measurement values of different scales to a common scale. Normalization is only applicable to numerical columns.

![Normalization Techniques](assets/normalization-technique.png)

There are five common normalization methods:

1. Single feature scaling
2. Min-max scaling
3. Z-score normalization
4. Log scaling
5. Clipping

### Single Feature Scaling
Single feature scaling transforms each value in a column into a number between 0 and 1
### Min-Max Scaling
This scales the data to a specific range, typically [0, 1]. The formula is:
x_new = x- min(x) / (max(x)-min(x))
​
###  Z-score Normalization
This normalization method transforms the data so that it has a mean of 0 and a standard deviation of 1. 

x_new =( x - μ) / σ 

Where: μ = mean and σ  = SD
 
 ### Log Scaling
 This method uses the logarithm to scale the data, which is useful when dealing with data that has a large range or is heavily skewed

 x_new = log(x+1)

 ### Clipping
 Clipping is a technique to limit the range of data by setting lower and upper bounds. It’s useful for handling outliers.

In [2]:
import numpy as np
import pandas as pd
products = pd.read_csv('data/product-data.csv')
# Single Feature Scaling
products['Sales_Single_Scale'] = products['Sales'] / products['Sales'].max()

# log
products['Sales_Log_Scale'] = np.log1p(products['Sales'])

# clipping
products['Discount_Clipped'] = products['Discount_Percentage'].clip(lower=5, upper=15)

products.head()

Unnamed: 0,Product_ID,Sales,Price,Customer_Rating,Discount_Percentage,Sales_Single_Scale,Sales_Log_Scale,Discount_Clipped
0,1,500,200,4.2,10,0.5,6.216606,10
1,2,700,450,3.9,15,0.7,6.552508,15
2,3,800,300,4.5,12,0.8,6.685861,12
3,4,900,350,4.8,8,0.9,6.803505,8
4,5,1000,500,4.0,5,1.0,6.908755,5


# Grouping & Aggregate

- Groping consists of three operations
    - Split
    - Apply
    - Combine

![Grouping](./assets/grouping.png)    


### Group By

In [3]:
import numpy as np
from IPython.display import display
import pandas as pd

students = pd.read_csv('data/student.csv')

# group data based on Gender
gender_data = students.groupby('gender')
display(gender_data.count())
gender_data['mark'].mean() #  
# standard deviation (std())

group_by_class = students.groupby('class')
display(group_by_class['mark'].mean())

Unnamed: 0_level_0,id,name,class,mark
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,14,14,14,11
male,18,18,18,15


class
Eight          NaN
Fifth    78.000000
Five     80.000000
Four     67.857143
Nine     41.500000
Seven    75.000000
Six      87.333333
Three    73.666667
Name: mark, dtype: float64

### Aggregation
- count() – Number of non-null observations
- sum() – Sum of values
- mean() – Mean of values
- median() – Arithmetic median of values
- min() – Minimum
- max() – Maximum
- mode() – Mode
- std() – Standard deviation
- var() – Variance

# Sorting

### Sort By Index

### Sort By Value
- Ascending
- Descending
- Sorting by Date
- Sorting with key Function

In [7]:
students = pd.read_csv('data/student.csv')
display(students.head())
new_sorted_data = students.sort_values(by=['name','mark'],ascending=True)
display(new_sorted_data.head())

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,,female
1,2,Max Ruin,Three,85.0,male
2,3,Arnold,Three,55.0,male
3,4,Krish Star,Four,60.0,
4,5,John Mike,Four,60.0,


Unnamed: 0,id,name,class,mark,gender
5,6,Alex John,Four,55.0,male
2,3,Arnold,Three,55.0,male
7,8,Asruid,Five,85.0,male
20,21,Babby John,Four,69.0,female
9,10,Big John,Four,55.0,


### Sorting By Date

If we want sort data based on datetime we need to convert the datetime to datetime object

In [8]:

data = pd.read_csv('data/student-admission.csv')

# convert to datetime
data['AdmissionDate']  = pd.to_datetime(data['AdmissionDate'])

# sort by AdminssionDate
sorted_by_date = data.sort_values(by=['AdmissionDate','Name'],ascending=True)
sorted_by_date



Unnamed: 0.1,Unnamed: 0,AdmissionDate,StudentID,Name,Stream
5,5,2021-01-17,1,Abhinav,IT
3,3,2021-01-18,2,Sohan,Mechanical
2,2,2021-01-20,3,Mohan,Civil
6,6,2021-01-21,4,Danny,EEE
4,4,2021-01-22,6,Lucky,CSE
1,1,2021-01-22,5,Shyam,ECE
0,0,2021-01-25,7,Ram,CSE


### Sorting with Key Function

In [137]:
sorted_key_data = data.sort_values(by='Name', key=lambda x: x.str.len(),ascending=False)
sorted_key_data

Unnamed: 0,AdmissionDate,StudentID,Name,Stream
5,2021-01-17,1,Abhinav,IT
1,2021-01-22,5,Shyam,ECE
2,2021-01-20,3,Mohan,Civil
3,2021-01-18,2,Sohan,Mechanical
4,2021-01-22,6,Lucky,CSE
6,2021-01-21,4,Danny,EEE
0,2021-01-25,7,Ram,CSE
