# Data Analysis with Pandas

## Import and use Pandas

In [1]:
import pandas as pd

# create new series
number_series = pd.Series([1,2,3,4])
print(number_series)

# create new Data Frame
user_data = pd.DataFrame({
    "Name":["Ram","Hari"],
    "Age": [20,23],
    "Address":["Kathmandu","Lalitpur"]
})
print(user_data)

0    1
1    2
2    3
3    4
dtype: int64
   Name  Age    Address
0   Ram   20  Kathmandu
1  Hari   23   Lalitpur


## Read data  from CSV, Excel and JSON file

In [2]:
# read data from csv
student_csv_data = pd.read_csv('data/student.csv')
print(student_csv_data.head())
# read data from csv file
student_excel_data = pd.read_excel('data/student.xlsx')
print(student_excel_data.head())
# read data from json
student_json_data = pd.read_json('data/student.json')
print(student_excel_data.head())

   id        name  class  mark  gender
0   1    John Deo   Four    75  female
1   2    Max Ruin  Three    85    male
2   3      Arnold  Three    55    male
3   4  Krish Star   Four    60  female
4   5   John Mike   Four    60  female
   id        name  class  mark  gender
0   1    John Deo   Four    75  female
1   2    Max Ruin  Three    85    male
2   3      Arnold  Three    55    male
3   4  Krish Star   Four    60  female
4   5   John Mike   Four    60  female
   id        name  class  mark  gender
0   1    John Deo   Four    75  female
1   2    Max Ruin  Three    85    male
2   3      Arnold  Three    55    male
3   4  Krish Star   Four    60  female
4   5   John Mike   Four    60  female


## Save Pandas Dataframe to csv, excel and JSON file

In [3]:
# create new Data Frame
user_data = pd.DataFrame({
    "Name":["Ram","Hari"],
    "Age": [20,23],
    "Address":["Kathmandu","Lalitpur"]
})

user_data.to_csv('data/user_data.csv')
user_data.to_excel('data/user_data.xlsx')
user_data.to_json('data/user_data.json')

## Basic Data Exploration

### Data Inspection

In [None]:
user_data.head()
user_data.tail()
user_data.info()
user_data.describe()
user_data.shape

Unnamed: 0,Age
count,2.0
mean,21.5
std,2.12132
min,20.0
25%,20.75
50%,21.5
75%,22.25
max,23.0


### Handle Missing Data

In [9]:
# axis, 0=row, 1=column
user_data.dropna(axis=0,how='any',subset='Name',inplace=True) # axis=0 default drop rows with NaN  
user_data.drop_duplicates(subset='Name',keep='first',inplace=True) # # Drop duplicates based on the 'Name' column
user_data.fillna(1)
user_data['Name'].fillna('Hari') # ony fill empty value of Name column

0     Ram
1    Hari
Name: Name, dtype: object

# Indexing in Pandas
- ##### Basic Indexing
- ##### Label-based Indexing (.loc[])
- ##### Integer-Label-based Indexing (.iloc[])
- ##### Single Value Access (.at[], .iat[])

## Basic Indexing

In [16]:
import pandas as pd

data = {
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8],
    'C': ['x', 'y', 'z', 'w']
}
df = pd.DataFrame(data)

# Basic indexing
print(df['A'])          # Single column as a Series
print(df[['A', 'B']])   # Multiple columns as a DataFrame

# Slicing rows
print(type(df[:1])) # only first row (dataframe)
print(df[1:3])          # Rows from index 1 to 2 (dataframe)


0    1
1    2
2    3
3    4
Name: A, dtype: int64
   A  B
0  1  5
1  2  6
2  3  7
3  4  8
<class 'pandas.core.frame.DataFrame'>
   A  B  C
1  2  6  y
2  3  7  z


## Label-based Indexing (.loc[])
- .loc[] is used for selecting rows and columns by labels
- We can use single labels, lists of labels, or slices of labels

In [23]:
# Selecting rows based on label
# print(df)
print(df.loc[1])                    # Row with label 1 (index) labels can be string, date etc
print(df.loc[0:2])                  # Rows with labels 0 to 2

# # Selecting specific rows and columns
print(df.loc[0:2, 'A':'C'])         # Rows 0 to 2, columns A to B
print(df.loc[[0, 2], ['A', 'C']])   # Rows 0 and 2, columns A and C


A    2
B    6
C    y
Name: 1, dtype: object
   A  B  C
0  1  5  x
1  2  6  y
2  3  7  z
   A  B  C
0  1  5  x
1  2  6  y
2  3  7  z
   A  C
0  1  x
2  3  z


<!--  -->

## Integer-location Based Indexing
- .iloc[] is used for selecting rows and columns by integer index positions
- Similar to .loc[], we can use single integers, lists of integers, or slices

In [24]:
# Selecting rows based on index positions
print(df.iloc[1])                   # Second row (index 1)
print(df.iloc[0:3])                 # Rows at index positions 0 to 2

# Selecting specific rows and columns
print(df.iloc[0:2, 0:2])            # Rows 0 to 1, columns 0 to 1
print(df.iloc[[0, 2], [0, 2]])      # Rows 0 and 2, columns 0 and 2


A    2
B    6
C    y
Name: 1, dtype: object
   A  B  C
0  1  5  x
1  2  6  y
2  3  7  z
   A  B
0  1  5
1  2  6
   A  C
0  1  x
2  3  z


## Single Value Access (.at[] and .iat[])
- .at[] is used for accessing a single scalar value by label.
- .iat[] is for accessing by index position.

In [25]:
# Using .at[] to access single value by label
print(df.at[1, 'B'])                # Value in row 1, column 'B'

# Using .iat[] to access single value by position
print(df.iat[1, 1])                 # Value in row 1, column 1


6
6


## Manipulating Index

## Summary Functions and Maps

## Grouping and Sorting

## Data Types and Missing Values

## Renaming and Combining