# Pandas

pandas is a library for working with data. It is typically used to:
- Analyze
- Clean
- Explore
- Manipulate

In [1]:
import pandas as pd

# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, one entry per row.
cars = dict(
    cars=["BMW", "Volvo", "Ford"],
    passings=[3, 7, 2],
)

df = pd.DataFrame(data=cars)

# A bare trailing expression makes the notebook render the frame as a table.
df

Unnamed: 0,cars,passings
0,BMW,3
1,Volvo,7
2,Ford,2


In [2]:
# Show which pandas release is installed in the current environment.
pd.__version__

'2.2.3'

## Series

A Series is a one-dimensional labeled array that can hold data of any type.

In [3]:
# Plain Python list supplying the values.
arr = [5, 1, 4]

# Attach custom labels instead of the default 0..n-1 positions.
series = pd.Series(data=arr, index=["x", "y", "z"])

series

x    5
y    1
z    4
dtype: int64

In [4]:
# Dict keys turn into the index labels of the resulting Series.
calories = dict(day1=420, day2=380, day3=390)

series = pd.Series(data=calories)

series

day1    420
day2    380
day3    390
dtype: int64

In [5]:
# Build the Series from only the listed keys; entries of `calories` that are
# not named in `index` are left out.
series = pd.Series(data=calories, index=["day1", "day3"])

series

day1    420
day3    390
dtype: int64

## DataFrames

A DataFrame is a two-dimensional data structure, like a two-dimensional array or a table with rows and columns.

In [6]:
# Two columns of equal length; each list position is one row.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

df = pd.DataFrame(data=data)

df

Unnamed: 0,calories,duration
0,420,50
1,380,40
2,390,45


In [7]:
# Select the row whose index label is 0; a single label returns that row as a Series.
df.loc[0]

calories    420
duration     50
Name: 0, dtype: int64

In [8]:
# Passing a list of labels to .loc returns the matching rows as a DataFrame.
df.loc[[0, 1]]

Unnamed: 0,calories,duration
0,420,50
1,380,40


In [9]:
# Build the frame with named rows instead of the default 0..n-1 index.
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

df = pd.DataFrame(data, index=["day1", "day2", "day3"])

# Print the whole frame and the "day2" row as separate paragraphs. print()'s
# default separator is a single space, which glues the first line of the row
# onto the last line of the frame; an explicit blank-line separator keeps the
# two reprs readable.
print(df, df.loc["day2"], sep="\n\n")

      calories  duration
day1       420        50
day2       380        40
day3       390        45 calories    380
duration     40
Name: day2, dtype: int64


## Read CSV Files

A simple way to store large data sets is to use CSV (comma-separated values) files.

In [14]:
df = pd.read_csv("data.csv")

# to_string() renders the slice as plain text, so print() shows the first
# ten rows as a text table.
preview = df.head(10)
print(preview.to_string())

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0
5        60    102       127     300.0
6        60    110       136     374.0
7        45    104       134     253.3
8        30    109       133     195.1
9        60     98       124     269.0


In [15]:
# Number of rows pandas will display before it truncates the output.
pd.get_option("display.max_rows")

60

In [17]:
# Raise the display limit so that an entire DataFrame can be shown without truncation.
pd.set_option("display.max_rows", 9999)
df = pd.read_csv("data.csv")
# Uncomment the next line to render every row of the frame:
# df

## Read JSON Files

Large data sets are often stored or exchanged as JSON.

In [18]:
# read_json parses the JSON file straight into a DataFrame.
df = pd.read_json(path_or_buf="data.json")

# First ten rows.
df.head(n=10)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
5,60,102,127,300.5
6,60,110,136,374.0
7,45,104,134,253.3
8,30,109,133,195.1
9,60,98,124,269.0


In [19]:
# Column -> {row label -> value} mapping, written as a Python dict.
# The row labels are strings.
data = {
    "Duration": {"0": 60, "1": 60, "2": 60, "3": 45, "4": 45, "5": 60},
    "Pulse": {"0": 110, "1": 117, "2": 103, "3": 109, "4": 117, "5": 102},
    "Maxpulse": {"0": 130, "1": 145, "2": 135, "3": 175, "4": 148, "5": 127},
    "Calories": {"0": 409, "1": 479, "2": 340, "3": 282, "4": 406, "5": 300},
}

# The outer keys become the columns and the inner keys become the index.
df = pd.DataFrame(data=data)

# Only six rows exist, so head(10) returns all of them.
df.head(n=10)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409
1,60,117,145,479
2,60,103,135,340
3,45,109,175,282
4,45,117,148,406
5,60,102,127,300


## Analyzing DataFrames

In [21]:
# Re-read data.csv into `df` for the analysis below.
df = pd.read_csv(filepath_or_buffer="data.csv")

# First ten rows.
df.head(n=10)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
5,60,102,127,300.0
6,60,110,136,374.0
7,45,104,134,253.3
8,30,109,133,195.1
9,60,98,124,269.0


In [22]:
# Show the last ten rows of the DataFrame.
df.tail(10)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
159,30,80,120,240.9
160,30,85,120,250.4
161,45,90,130,260.4
162,45,95,130,270.0
163,45,100,140,280.9
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


In [None]:
# Print a summary of the frame: number of entries, column names, non-null
# counts, dtypes and memory usage. A Non-Null Count lower than the number of
# entries marks a column that contains missing values.
df.info()

# Empty values, or Null values, can be bad when analyzing data, and you should
# consider removing rows with empty values.
# (Kept as a comment rather than a bare triple-quoted string: a string that is
# the last expression of a cell is echoed by Jupyter as the cell's result.)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB
