# Data Analysis with Pandas

### Import and use Pandas

In [32]:
import pandas as pd

# create new series
number_series = pd.Series([1,2,3,4])
print(number_series)

# create new Data Frame
user_data_dict = {
    "Name":["Ram","Hari"],
    "Age": [20,23],
    "Address":["Kathmandu","Lalitpur"]
}
user_data = pd.DataFrame(user_data_dict)
user_data

0    1
1    2
2    3
3    4
dtype: int64


Unnamed: 0,Name,Age,Address
0,Ram,20,Kathmandu
1,Hari,23,Lalitpur


## Read data  from CSV, Excel and JSON file

### Read data from csv

In [33]:
import pandas as pd

student_csv_data = pd.read_csv('data/student.csv')
student_csv_data.head()

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
3,4,Krish Star,Four,60,female
4,5,John Mike,Four,60,female


### Save Pandas Dataframe to csv, excel and JSON file

In [34]:
import pandas as pd

# create new Data Frame
user_data = pd.DataFrame({
    "Name":["Ram","Hari"],
    "Age": [20,23],
    "Address":["Kathmandu","Lalitpur"]
})

user_data.to_csv('./data/user_data.csv')
user_data.to_excel('data/user_data.xlsx')

## Basic Data Exploration

### Data Inspection

In [35]:
import pandas as pd
students = pd.read_csv('data/student.csv')
print(students.shape)
print(students.head()) # print top n=5 rows
# print(students.tail()) # print bottom n=5 rows
print(students.info())
students.describe()

(35, 5)
   id        name  class  mark  gender
0   1    John Deo   Four    75  female
1   2    Max Ruin  Three    85    male
2   3      Arnold  Three    55    male
3   4  Krish Star   Four    60  female
4   5   John Mike   Four    60  female
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      35 non-null     int64 
 1   name    35 non-null     object
 2   class   35 non-null     object
 3   mark    35 non-null     int64 
 4   gender  35 non-null     object
dtypes: int64(2), object(3)
memory usage: 1.5+ KB
None


Unnamed: 0,id,mark
count,35.0,35.0
mean,18.0,74.657143
std,10.246951,16.401117
min,1.0,18.0
25%,9.5,62.5
50%,18.0,79.0
75%,26.5,88.0
max,35.0,96.0


# Data Selection and Indexing in Pandas
- ##### Basic Indexing
- ##### Label-based Indexing (.loc[])
- ##### Integer-Label-based Indexing (.iloc[])
- ##### Single Value Access (.at[], .iat[])

### Basic Indexing

In [36]:
#  
# # Basic indexing
print(students['name'])          # Single column as a Series
new_std_data = students[['id','name','mark']]
print(new_std_data)   # Multiple columns as a DataFrame

# # # Slicing rows
first_10_rows = students[:10]
print(first_10_rows) # only first row (dataframe)
last_five_rows = students[30:].copy()
print(last_five_rows)          # Rows from index 30 to last index (dataframe)


0        John Deo
1        Max Ruin
2          Arnold
3      Krish Star
4       John Mike
5       Alex John
6     My John Rob
7          Asruid
8         Tes Qry
9        Big John
10         Ronald
11          Recky
12            Kty
13           Bigy
14       Tade Row
15          Gimmy
16          Tumyu
17          Honny
18          Tinny
19         Jackly
20     Babby John
21         Reggid
22          Herod
23      Tiddy Now
24       Giff Tow
25         Crelea
26       Big Nose
27      Rojj Base
28    Tess Played
29      Reppy Red
30    Marry Toeey
31      Binn Rott
32      Kenn Rein
33       Gain Toe
34     Rows Noump
Name: name, dtype: object
    id         name  mark
0    1     John Deo    75
1    2     Max Ruin    85
2    3       Arnold    55
3    4   Krish Star    60
4    5    John Mike    60
5    6    Alex John    55
6    7  My John Rob    78
7    8       Asruid    85
8    9      Tes Qry    78
9   10     Big John    55
10  11       Ronald    89
11  12        Recky    94
12  13

### Label-based Indexing (.loc[])
- .loc[] is used for selecting rows and columns by labels
- We can use single labels, lists of labels, or slices of labels

In [37]:
# Selecting rows based on label
students = pd.read_csv('data/student.csv')
first_row = students.loc[1]
first_two_rows = students.loc[0:1]
print(first_row)                    # Row with label 1 (index) labels can be string, date etc
print(first_two_rows)                  # Rows with labels 0 to 1

# # Selecting specific rows to columns
first_five_stds_with_name_to_marks = students.loc[0:4,'name':'mark']
print(first_five_stds_with_name_to_marks)         # Rov ws 0 to 2, columns name to mark

# select only particular rows and columns
first_and_second_row_with_name_and_gender = students.loc[[5, 20], ['name', 'gender']]
# Rows 6th and 21th, columns name and gender
first_and_second_row_with_name_and_gender


id               2
name      Max Ruin
class        Three
mark            85
gender        male
Name: 1, dtype: object
   id      name  class  mark  gender
0   1  John Deo   Four    75  female
1   2  Max Ruin  Three    85    male
         name  class  mark
0    John Deo   Four    75
1    Max Ruin  Three    85
2      Arnold  Three    55
3  Krish Star   Four    60
4   John Mike   Four    60


Unnamed: 0,name,gender
5,Alex John,male
20,Babby John,female


<!--  -->

### Integer-location Based Indexing
- .iloc[] is **used** for selecting rows and columns by integer index positions
- Similar to .loc[], we can use single integers, lists of integers, or slices

In [38]:
# Selecting rows based on index positions
print(students.iloc[1])                   # Second row (index 1)
print(students.iloc[0:3])                 # Rows at index positions 0 to 2

# Selecting specific rows and columns
print(students.iloc[0:2, 0:2])            # Rows 0 to 1, columns 0 to 1
print(students.iloc[[0, 2], [0, 2]])      # Rows 0 and 1, columns 0 and 1


id               2
name      Max Ruin
class        Three
mark            85
gender        male
Name: 1, dtype: object
   id      name  class  mark  gender
0   1  John Deo   Four    75  female
1   2  Max Ruin  Three    85    male
2   3    Arnold  Three    55    male
   id      name
0   1  John Deo
1   2  Max Ruin
   id  class
0   1   Four
2   3  Three


### Single Value Access (.at[] and .iat[])
- .at[] is used for accessing a single scalar value by label.
- .iat[] is for accessing by index position.

In [39]:
# Using .at[] to access single value by label
print(students.at[1, 'name'])                # Value in row 2, column 'name'

# Using .iat[] to access single value by position
print(students.iat[5, 1])                 # Value in row 6, column 1


Max Ruin
Alex John


# Filtering
- Logical operators (>, <, ==,  !=)
- Multiple logical operators (& (AND) and | (OR))
- Str ancessor, Isin, contains, between, startWith
- Tilde (~)
- Query
- Nlargest and nsmallest
- Loc and iloc

### Logical Operators

In [40]:
# students with marks less than 40
stds_marks_less_than_40 = students[students.mark<40]
print(stds_marks_less_than_40.head())

boys = students[students.gender=='male']

boys.head()


    id   name class  mark gender
18  19  Tinny  Nine    18   male


Unnamed: 0,id,name,class,mark,gender
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
5,6,Alex John,Four,55,male
6,7,My John Rob,Fifth,78,male
7,8,Asruid,Five,85,male


### Multiple Logical Opertors

In [41]:
#  students  of class 'four' and have marks above 80
class_four_high_marks = students[ (students['class'] == 'Four')  & (students['mark'] > 80) ]
class_four_high_marks.head()

# students of class 'four' and 'Five' and have marks above 70
stds_of_fout_five = students[( ( students['class'] =='Four' ) | (students['class']=='Five') )  & (students['mark']>70)]
stds_of_fout_five

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
7,8,Asruid,Five,85,male
14,15,Tade Row,Four,88,male
15,16,Gimmy,Four,88,male
17,18,Honny,Five,75,male
30,31,Marry Toeey,Four,88,male


### str,Isin, contains, between, startWith

In [42]:
# students whose name starts with 'J'
stds_with_name_j = students[students.name.str.startswith('J')]
stds_with_name_j.head()

stds_with_name_gimmy = students[students.name.str.contains('Gimmy')]
print(stds_with_name_gimmy)
# students with marks between 40 and 50
stds_with_name_j = students[students.mark.between(40,50)]
stds_with_name_j.head()

# students of class Nine or Ten
class_nine_or_ten = students[students['class'].isin(['Nine', 'Ten'])]
class_nine_or_ten

    id   name class  mark gender
15  16  Gimmy  Four    88   male


Unnamed: 0,id,name,class,mark,gender
18,19,Tinny,Nine,18,male
19,20,Jackly,Nine,65,female


### Tilde(~)

In [43]:
# Filter students who are NOT in class 'Nine'
not_class_nine = students[~students['class'].isin(['Nine'])]
not_class_nine.head()

Unnamed: 0,id,name,class,mark,gender
0,1,John Deo,Four,75,female
1,2,Max Ruin,Three,85,male
2,3,Arnold,Three,55,male
3,4,Krish Star,Four,60,female
4,5,John Mike,Four,60,female


### Query

In [44]:
# Filter students with marks above 85 and gender is 'male'
query_filter = students.query("mark > 85 and gender == 'male'")
query_filter.head()

Unnamed: 0,id,name,class,mark,gender
14,15,Tade Row,Four,88,male
15,16,Gimmy,Four,88,male
24,25,Giff Tow,Seven,88,male
30,31,Marry Toeey,Four,88,male


### nlargest and nsmallest
Use nlargest and nsmallest to get rows with the top or bottom values in a column

In [45]:
# Get top 3 students with the highest marks
top_3_marks = students.nlargest(3, 'mark')
print(top_3_marks)
# Get 2 students with the lowest marks
bottom_2_marks = students.nsmallest(2, 'mark')
bottom_2_marks

    id       name  class  mark  gender
32  33  Kenn Rein    Six    96  female
11  12      Recky    Six    94  female
31  32  Binn Rott  Seven    90  female


Unnamed: 0,id,name,class,mark,gender
18,19,Tinny,Nine,18,male
16,17,Tumyu,Six,54,male


### loc and iloc

In [46]:
# Using `loc` to select rows with IDs greater than 2
loc_filter = students.loc[students['id']> 2]
print(loc_filter.head())
# # Using `iloc` to select the first 3 rows
# iloc_filter = students.iloc[:3]


   id         name  class  mark  gender
2   3       Arnold  Three    55    male
3   4   Krish Star   Four    60  female
4   5    John Mike   Four    60  female
5   6    Alex John   Four    55    male
6   7  My John Rob  Fifth    78    male


# Data Transformation and Mapping
- **apply**: allows us to apply a function to each element, row, or column in a DataFrame or Series
- **map**: is  used to map values in a Series according to a dictionary or another Series
- **replace**: replace allows for replacing specific values in the DataFrame with new value
- **astype**: this is used to convert data type
- **pipe**: it allows chaining and using complex functions that operate on the entire DataFrame. This can be used for complex transformation operation

### apply

In [47]:
# double the marks of the student using apply and lambda function
students['double_mark'] = students['mark'].apply(lambda m: m * 2)
students.head()

Unnamed: 0,id,name,class,mark,gender,double_mark
0,1,John Deo,Four,75,female,150
1,2,Max Ruin,Three,85,male,170
2,3,Arnold,Three,55,male,110
3,4,Krish Star,Four,60,female,120
4,5,John Mike,Four,60,female,120


### map


In [48]:
# Replace the 'male' female with 1 or 0
gender_map = {'male': 1, 'female': 0}
students['gender_code'] = students['gender'].map(gender_map)
students.head()

Unnamed: 0,id,name,class,mark,gender,double_mark,gender_code
0,1,John Deo,Four,75,female,150,0
1,2,Max Ruin,Three,85,male,170,1
2,3,Arnold,Three,55,male,110,1
3,4,Krish Star,Four,60,female,120,0
4,5,John Mike,Four,60,female,120,0


### replace

In [49]:
# Replace specific values in the 'class' column
students['class'] = students['class'].replace({'Four': '4th', 'Ten': '10th', 'Eight': '8th'})
students.head()

Unnamed: 0,id,name,class,mark,gender,double_mark,gender_code
0,1,John Deo,4th,75,female,150,0
1,2,Max Ruin,Three,85,male,170,1
2,3,Arnold,Three,55,male,110,1
3,4,Krish Star,4th,60,female,120,0
4,5,John Mike,4th,60,female,120,0


### astype

In [50]:
# Convert 'mark' column to float
students['mark'] = students['mark'].astype(float)
students.head()

Unnamed: 0,id,name,class,mark,gender,double_mark,gender_code
0,1,John Deo,4th,75.0,female,150,0
1,2,Max Ruin,Three,85.0,male,170,1
2,3,Arnold,Three,55.0,male,110,1
3,4,Krish Star,4th,60.0,female,120,0
4,5,John Mike,4th,60.0,female,120,0


### pipe

In [51]:
# create a function that triples marks and pipes it to the pandas DataFrame
def triple_marks(df):
    df['mark'] = df['mark'] * 3
    return df

students = students.pipe(triple_marks)
students.head()

Unnamed: 0,id,name,class,mark,gender,double_mark,gender_code
0,1,John Deo,4th,225.0,female,150,0
1,2,Max Ruin,Three,255.0,male,170,1
2,3,Arnold,Three,165.0,male,110,1
3,4,Krish Star,4th,180.0,female,120,0
4,5,John Mike,4th,180.0,female,120,0


# Data Cleaning and Manipulation
- Handle Missing Data
- Handle Duplicates
- One-Hot-Encoding
- Normalization 

![Benefits of Data Cleaning](assets/data-cleaning.png)


### Handle Missing Data
- Remove Rows Containing Missing Values ``(df.dropna())``
    - ``dropna(axis=0)`` removes rows containing at least one missing value
    - ``(dropna(axis=1))`` Removes columns containing at least one missing value 
- Replace Missing Values ``(df.fillna())``
    - **Replace with specific value**: ``(df.fillna(value))`` fill the missing values with the previous row value of column value (if axis=1)
    - **Backward Fill**: ``(df.fillna(value,method='ffill'))`` replaces the missing values with the next value
    - **Fill individual column with specified value**: ``(df.fillna({'height':150,'weight:60})`` replaces the 
    missing values of column height with 150 and column weight with 60 
    - **Fill with average, Interpolation etc.**: Fill the missing value using average, or using interpolation


In [None]:
from IPython.display import display
# axis, 0=row, 1=column
hw_data = pd.read_csv('./data/weight-height.csv')
display(hw_data.head())
hw_data.dropna(axis=0,how='any',inplace=True) # (axis=0 default) drop rows with NaN  
hw_data.fillna(1)  # replace missing values with 1
hw_data.fillna(method='ffill') # fill with previous row value
hw_data.fillna(method='bfill') # fill with next row
hw_data['Weight'].fillna(150) # ony fill empty value of Name column
hw_data.fillna({'Height':150,'Weight':60})
df_interpolated = hw_data.interpolate()
hw_data.fillna(df_interpolated)
hw_data['Weight'].fillna(hw_data['Weight'].mean())

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


  hw_data.fillna(method='ffill') # fill with previous row value
  hw_data.fillna(method='bfill') # fill with next row
  df_interpolated = hw_data.interpolate()


0       241.893563
1       162.310473
2       212.740856
3       220.042470
4       206.349801
           ...    
9995    136.777454
9996    170.867906
9997    128.475319
9998    163.852461
9999    113.649103
Name: Weight, Length: 10000, dtype: float64

### Handle Duplicate Data
- Check Duplicates
- Drop Duplicates

In [None]:
duplicates = hw_data.duplicated()
hw_data.drop_duplicates()
duplicates

0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool

: 

# Grouping & Aggregate

### Grouping
- Groping consists of three operations
    - Split
    - Apply
    - Combine
    
![Grouping](./assets/grouping.png)

# Visualization