# Introduction to Pandas

In [1]:
import pandas as pd

## Series

In [2]:
# Sample patient heights used to build the first Series.
height_list = [
    1.5,
    1.66,
    1.8,
    1.3,
]

In [3]:
# Build a Series from the list; no index is given, so the default RangeIndex is used.
heights = pd.Series(height_list)

In [4]:
# Display the Series: index labels on the left, values on the right.
heights

0    1.50
1    1.66
2    1.80
3    1.30
dtype: float64

In [5]:
# Look up the element whose index label is 0.
heights[0]

np.float64(1.5)

In [6]:
# Same lookup for index label 1.
heights[1]

np.float64(1.66)

In [7]:
# Slice out the first three elements.
heights[:3]

0    1.50
1    1.66
2    1.80
dtype: float64

In [8]:
# The underlying data as a NumPy array.
heights.values

array([1.5 , 1.66, 1.8 , 1.3 ])

In [9]:
# The index object that holds the row labels.
heights.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
# One single-letter label per patient, in the same order as the heights.
patient = list("ABCD")

In [11]:
# Replace the default integer index with the patient labels.
heights.index = patient

In [12]:
# Display the Series again, now labelled A-D.
heights

A    1.50
B    1.66
C    1.80
D    1.30
dtype: float64

In [13]:
# The index now holds string labels, so an integer key in plain [] would fall
# back to positional lookup. That fallback is deprecated (it emits a
# FutureWarning), so ask for the position explicitly with .iloc.
heights.iloc[0]

  heights[0]


np.float64(1.5)

### loc
Select and slice by index *label*

In [14]:
# Plain [] with a label performs a label lookup.
heights["A"]

np.float64(1.5)

In [15]:
# .loc is the explicit label-based accessor; same result as heights["A"].
heights.loc["A"]

np.float64(1.5)

In [16]:
# Label slice: unlike positional slicing, the end label 'C' is included.
heights['A':'C']

A    1.50
B    1.66
C    1.80
dtype: float64

In [17]:
# The same label slice through .loc; the end label 'C' is included.
heights.loc['A':'C']

A    1.50
B    1.66
C    1.80
dtype: float64

### iloc
Select and slice by *position*

In [18]:
# First element by position, regardless of its label.
heights.iloc[0]

np.float64(1.5)

In [19]:
# Negative positions count from the end: this is the last element.
heights.iloc[-1]

np.float64(1.3)

In [21]:
# List every attribute of the Series. Note that the index labels 'A'-'D'
# appear in the listing alongside the regular attributes and methods.
dir(heights)

['A',
 'B',
 'C',
 'D',
 'T',
 '_AXIS_LEN',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__column_consortium_standard__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pandas_priority__',
 '__pos__',
 '__pow__',

## DataFrames

### Creating a DataFrame

In [25]:
# Column-oriented input: each keyword becomes a column, each list its values.
data = dict(
    cid=[1, 2, 3],
    name=['Thomas', 'Sian', 'Vicky'],
    stood=[True, False, False],
)

# The dict keys become the column names; rows get the default integer index.
df = pd.DataFrame(data)

In [26]:
# Display the small hand-built frame.
df

Unnamed: 0,cid,name,stood
0,1,Thomas,True
1,2,Sian,False
2,3,Vicky,False


### Reading in from Files

In [27]:
# Load the loan data from CSV (the path is relative to the working directory).
# NOTE: this rebinds `df`, replacing the small hand-built frame from earlier.
df = pd.read_csv('data/loan_data.csv')

In [28]:
# Display the loaded frame; long frames are shown truncated (first and last rows).
df

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
0,567,17500,Short Term,1460,272,225.0,False
1,523,18500,Long Term,890,970,187.0,False
2,544,20700,Short Term,880,884,85.0,False
3,370,21600,Short Term,920,0,,False
4,756,24300,Short Term,1260,0,495.0,False
...,...,...,...,...,...,...,...
851,71,30000,Long Term,1270,3779,52.0,True
852,932,42500,Long Term,1550,0,779.0,False
853,39,36400,Long Term,1830,3032,360.0,True
854,283,42200,Long Term,1500,2498,417.0,False


In [29]:
# Read only the 'March' sheet of the Excel workbook into its own frame.
loan_excel = pd.read_excel('data/loan_data.xlsx', sheet_name='March')

In [30]:
# Display the frame read from Excel.
loan_excel

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
0,567,17500,Short Term,1460,272,225.0,False
1,523,18500,Long Term,890,970,187.0,False
2,544,20700,Short Term,880,884,85.0,False
3,370,21600,Short Term,920,0,,False
4,756,24300,Short Term,1260,0,495.0,False
...,...,...,...,...,...,...,...
851,71,30000,Long Term,1270,3779,52.0,True
852,932,42500,Long Term,1550,0,779.0,False
853,39,36400,Long Term,1830,3032,360.0,True
854,283,42200,Long Term,1500,2498,417.0,False


In [32]:
# orient='split' tells read_json the file stores index, columns and data as
# separate entries rather than as a list of records.
weather = pd.read_json('data/weather.json', orient='split')

In [33]:
# Display the weather frame; its index holds dates rather than integers.
weather

Unnamed: 0,temp,humidity,sun_hrs
2023-07-15,15.68,73.18,6.4
2023-07-16,25.16,83.88,8.06
2023-07-17,13.26,80.05,4.89
2023-07-18,24.63,82.37,9.13
2023-07-19,12.78,83.1,17.1
2023-07-20,23.52,85.35,0.72
2023-07-21,17.8,85.64,5.79
2023-07-22,24.98,76.81,10.95
2023-07-23,23.48,80.86,3.77
2023-07-24,23.3,79.96,14.62


In [36]:
import sqlite3
from contextlib import closing

# Using a sqlite3 connection directly as a context manager only manages
# transactions; closing() is what guarantees the connection is released once
# the query result has been read into the frame.
with closing(sqlite3.connect("data/movies_db.sqlite")) as conn:
    # read_sql runs the query on the open connection and returns a DataFrame.
    movies_df = pd.read_sql("SELECT * FROM movies", conn)

# Last expression of the cell: display the query result.
movies_df

Unnamed: 0,id,name,year,rating
0,1,Who's Afraid of Virginia Woolf?,1966,10
1,2,Zardoz,1974,6
2,3,2001: A Space Odyssey,1968,9


### Working with DataFrames

In [37]:
# Display the loan frame that the following cells work with.
df

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
0,567,17500,Short Term,1460,272,225.0,False
1,523,18500,Long Term,890,970,187.0,False
2,544,20700,Short Term,880,884,85.0,False
3,370,21600,Short Term,920,0,,False
4,756,24300,Short Term,1260,0,495.0,False
...,...,...,...,...,...,...,...
851,71,30000,Long Term,1270,3779,52.0,True
852,932,42500,Long Term,1550,0,779.0,False
853,39,36400,Long Term,1830,3032,360.0,True
854,283,42200,Long Term,1500,2498,417.0,False


In [38]:
# Single brackets with one column name return that column as a Series.
df['Income']

0      17500
1      18500
2      20700
3      21600
4      24300
       ...  
851    30000
852    42500
853    36400
854    42200
855    30800
Name: Income, Length: 856, dtype: int64

In [39]:
# A list of column names returns a DataFrame containing just those columns.
df[['Income', 'Balance']]

Unnamed: 0,Income,Balance
0,17500,1460
1,18500,890
2,20700,880
3,21600,920
4,24300,1260
...,...,...
851,30000,1270
852,42500,1550
853,36400,1830
854,42200,1500


In [None]:
# NOTE(review): unexecuted duplicate of the previous cell; consider removing it.
df[['Income', 'Balance']]

In [84]:
# The same selection through .loc: all rows (:), then the listed column labels.
df.loc[:, ['Income', 'Balance']]

Unnamed: 0,Income,Balance
0,17500,1460
1,18500,890
2,20700,880
3,21600,920
4,24300,1260
...,...,...
851,30000,1270
852,42500,1550
853,36400,1830
854,42200,1500


In [44]:
# Purely positional: first row, second column (Income).
df.iloc[0, 1]

np.int64(17500)

In [45]:
# .loc slices by label and includes the end label, so this returns rows 0
# through 10 (eleven rows), not ten.
df.loc[:10, 'Income']

0     17500
1     18500
2     20700
3     21600
4     24300
5     22900
6     20400
7     24600
8     26500
9     25400
10    19700
Name: Income, dtype: int64

In [83]:
# Pick specific rows by index label, keeping every column.
# NOTE(review): the saved output was produced out of order (after the cells
# further down that add 'Daily Budget' and 'MeanTermDebt'); on a top-to-bottom
# run those two columns will not be present here yet.
df.loc[[1, 3, 5, 9], :]

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default,Daily Budget,MeanTermDebt
1,523,18500,Long Term,890,970,187.0,False,29.666667,715.823529
3,370,21600,Short Term,920,0,,False,30.666667,610.232877
5,929,22900,Long Term,1540,1229,383.0,False,51.333333,715.823529
9,621,25400,Short Term,1130,0,729.0,True,37.666667,610.232877


In [46]:
# Summarise each column's dtype and non-null count; Score is the only column with gaps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856 entries, 0 to 855
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       856 non-null    int64  
 1   Income   856 non-null    int64  
 2   Term     856 non-null    object 
 3   Balance  856 non-null    int64  
 4   Debt     856 non-null    int64  
 5   Score    836 non-null    float64
 6   Default  856 non-null    bool   
dtypes: bool(1), float64(1), int64(4), object(1)
memory usage: 41.1+ KB


In [50]:
# Convert ID from int64 to the string dtype (presumably because IDs are
# identifiers rather than quantities to do arithmetic on).
df['ID'] = df['ID'].astype('string')

In [51]:
# Convert Term from the generic object dtype to the dedicated string dtype.
df['Term'] = df['Term'].astype('string')

In [52]:
# Re-check the dtypes: ID and Term should now be reported as string.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 856 entries, 0 to 855
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       856 non-null    string 
 1   Income   856 non-null    int64  
 2   Term     856 non-null    string 
 3   Balance  856 non-null    int64  
 4   Debt     856 non-null    int64  
 5   Score    836 non-null    float64
 6   Default  856 non-null    bool   
dtypes: bool(1), float64(1), int64(3), string(2)
memory usage: 41.1 KB


### Querying and Aggregation

In [54]:
# Element-wise comparison: one True/False per row, usable as a filter mask.
df['Income'].gt(20_000)

0      False
1      False
2       True
3       True
4       True
       ...  
851     True
852     True
853     True
854     True
855     True
Name: Income, Length: 856, dtype: bool

In [56]:
# Keep only the rows where the mask is True.
# SQL equivalent: SELECT * FROM df WHERE Income > 20000
df.loc[df['Income'].gt(20_000)]

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
2,544,20700,Short Term,880,884,85.0,False
3,370,21600,Short Term,920,0,,False
4,756,24300,Short Term,1260,0,495.0,False
5,929,22900,Long Term,1540,1229,383.0,False
6,373,20400,Short Term,1200,0,556.0,False
...,...,...,...,...,...,...,...
851,71,30000,Long Term,1270,3779,52.0,True
852,932,42500,Long Term,1550,0,779.0,False
853,39,36400,Long Term,1830,3032,360.0,True
854,283,42200,Long Term,1500,2498,417.0,False


In [59]:
# Combine two masks with & (element-wise AND): both conditions must hold.
df.loc[df['Income'].gt(20_000) & df['Balance'].lt(1000)]

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
2,544,20700,Short Term,880,884,85.0,False
3,370,21600,Short Term,920,0,,False
8,284,26500,Long Term,720,1866,243.0,False
11,763,20600,Short Term,820,1218,136.0,False
15,24,23200,Short Term,610,0,325.0,False
...,...,...,...,...,...,...,...
825,942,51700,Long Term,970,6409,92.0,True
826,589,44200,Long Term,810,293,908.0,False
833,269,24200,Long Term,890,0,629.0,False
837,802,23900,Short Term,640,0,452.0,False


In [63]:
# New column: the balance spread evenly over 30 days.
df['Daily Budget'] = df['Balance'].div(30)

In [66]:
# Row-by-row ratio of income to balance; shown only, not stored in the frame.
df['Income'].div(df['Balance'])

0      11.986301
1      20.786517
2      23.522727
3      23.478261
4      19.285714
         ...    
851    23.622047
852    27.419355
853    19.890710
854    28.133333
855    25.882353
Length: 856, dtype: float64

In [68]:
# Group the rows by Term, then average Balance within each group.
df.groupby('Term')[['Balance']].mean()

Unnamed: 0_level_0,Balance
Term,Unnamed: 1_level_1
Long Term,1334.080882
Short Term,1158.561644


In [77]:
# Two grouping keys give one row per (Term, Default) pair; report the
# standard deviation of Balance and Income within each pair.
df.groupby(['Term', 'Default'])[['Balance', 'Income']].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,Balance,Income
Term,Default,Unnamed: 2_level_1,Unnamed: 3_level_1
Long Term,False,678.981768,15040.624461
Long Term,True,324.488017,12544.341139
Short Term,False,557.829841,13289.141677
Short Term,True,460.50547,10012.332934


In [80]:
# transform('mean') returns one value per original row (the mean Debt of that
# row's Term group), so the result lines up with df's index. Selecting the
# 'Debt' column before transforming yields a Series, which is the natural
# right-hand side for a single-column assignment.
df['MeanTermDebt'] = df.groupby('Term')['Debt'].transform('mean')

In [81]:
# Display the frame with the two derived columns, Daily Budget and MeanTermDebt.
df

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default,Daily Budget,MeanTermDebt
0,567,17500,Short Term,1460,272,225.0,False,48.666667,610.232877
1,523,18500,Long Term,890,970,187.0,False,29.666667,715.823529
2,544,20700,Short Term,880,884,85.0,False,29.333333,610.232877
3,370,21600,Short Term,920,0,,False,30.666667,610.232877
4,756,24300,Short Term,1260,0,495.0,False,42.000000,610.232877
...,...,...,...,...,...,...,...,...,...
851,71,30000,Long Term,1270,3779,52.0,True,42.333333,715.823529
852,932,42500,Long Term,1550,0,779.0,False,51.666667,715.823529
853,39,36400,Long Term,1830,3032,360.0,True,61.000000,715.823529
854,283,42200,Long Term,1500,2498,417.0,False,50.000000,715.823529
