<a href="https://colab.research.google.com/github/thihanaung-thnn/notes_Python/blob/main/notes_jose_portilla_lectures/machine_learning_with_python/02_pandas_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas 

## Series
- A Series is a one-dimensional labeled array: like a NumPy array, but every value is paired with an index label.

In [39]:
import numpy as np
import pandas as pd

In [40]:
# Two parallel lists: the labels and the values they tag.
index = ['First', 'Second', 'Third']
data = [22, 33, 44]
# Values go in positionally; the labels are attached through `index`.
pd.Series(data, index=index)

First     22
Second    33
Third     44
dtype: int64

In [41]:
# Seed the legacy global RNG so the same five values are drawn on every run.
np.random.seed(0)
rand_data = np.random.randint(0,100,5)  # five integers in [0, 100)
names = ['Chou','Saber','Franco','Angela','Myia']
# Positional form: the first argument is the data, the second is the index.
power = pd.Series(rand_data, names)
power # names are indices. 

Chou      77
Saber     74
Franco    73
Angela    75
Myia      52
dtype: int64

In [42]:
# A dict maps straight onto a Series: keys become labels, values become data.
ages = dict(Tim=22, John=33, Rose=22)
pds = pd.Series(ages)
pds

Tim     22
John    33
Rose    22
dtype: int64

In [43]:
type(pds)

pandas.core.series.Series

In [44]:
# named index 
# Label lookup works like fetching a value from a dictionary by its key.
pds['Tim']

22

In [45]:
# Positional access: iloc always means "the n-th element". Plain pds[0] on a
# label-indexed Series is deprecated because an integer key is ambiguous there.
pds.iloc[0]

22

In [46]:
# Operations
# keys() returns the index labels of the Series.
pds.keys()

Index(['Tim', 'John', 'Rose'], dtype='object')

In [47]:
pds*10

Tim     220
John    330
Rose    220
dtype: int64

In [48]:
pds/2

Tim     11.0
John    16.5
Rose    11.0
dtype: float64

In [49]:
# Multiplying two Series is element-wise; values are matched by index label.
pds * pds

Tim      484
John    1089
Rose     484
dtype: int64

## DataFrames
- A DataFrame is a collection of pandas Series (its columns) that all share the same index.

### Creating DataFrame

In [50]:
# Fix the legacy global seed so the same 4x3 grid is drawn on every run.
np.random.seed(13)
data = np.random.randint(low=0, high=101, size=(4, 3))  # integers in [0, 100]
data

array([[82, 48, 74],
       [16, 98, 25],
       [58, 99, 26],
       [66, 46, 54]])

In [51]:
# Row and column labels, used when the labelled DataFrame is built in the next cell.
names = ['Tigeral','Minotour','Johnson','Uranus']
col = ['Str','Int','Att']
# With no index/columns given, pandas falls back to integer labels 0, 1, 2, ...
pd.DataFrame(data=data)


Unnamed: 0,0,1,2
0,82,48,74
1,16,98,25
2,58,99,26
3,66,46,54


In [52]:
# Same array, now with `names` as the row labels and `col` as the column labels.
df = pd.DataFrame(data = data, index=names, columns = col)
df

Unnamed: 0,Str,Int,Att
Tigeral,82,48,74
Minotour,16,98,25
Johnson,58,99,26
Uranus,66,46,54


### Glimpsing DataFrame

In [53]:
# Load the full training set, then work on an independent 100-row sample.
data = pd.read_csv('sample_data/california_housing_train.csv')
# Slice first and copy last: a row slice still refers to its parent frame, so
# adding columns to it later would raise SettingWithCopyWarning.
df = data.iloc[0:100].copy()

In [54]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value'],
      dtype='object')

In [55]:
df.index

RangeIndex(start=0, stop=100, step=1)

In [56]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [57]:
df.tail(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
97,-115.58,32.79,14.0,1687.0,507.0,762.0,451.0,1.6635,64400.0
98,-115.58,32.78,5.0,2494.0,414.0,1416.0,421.0,5.7843,110100.0
99,-115.59,32.85,20.0,1608.0,274.0,862.0,248.0,4.875,90800.0


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           100 non-null    float64
 1   latitude            100 non-null    float64
 2   housing_median_age  100 non-null    float64
 3   total_rooms         100 non-null    float64
 4   total_bedrooms      100 non-null    float64
 5   population          100 non-null    float64
 6   households          100 non-null    float64
 7   median_income       100 non-null    float64
 8   median_house_value  100 non-null    float64
dtypes: float64(9)
memory usage: 7.2 KB


In [59]:
len(df)

100

In [60]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,-115.2991,33.1441,23.16,1677.85,376.22,1077.71,323.59,2.462202,72825.0
std,0.389058,0.597297,9.234017,1070.454528,260.708963,675.559585,183.525359,1.168484,22706.232736
min,-115.59,32.67,5.0,44.0,24.0,29.0,15.0,0.8571,25000.0
25%,-115.55,32.79,17.0,1149.25,231.5,669.75,206.25,1.64805,60550.0
50%,-115.515,32.845,21.0,1492.5,323.5,949.5,286.0,2.1073,68850.0
75%,-115.295,33.2875,30.0,1989.75,443.5,1314.0,412.75,3.1962,84950.0
max,-114.31,34.91,48.0,7650.0,1901.0,3565.0,1056.0,6.2715,146300.0


In [61]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
longitude,100.0,-115.2991,0.389058,-115.59,-115.55,-115.515,-115.295,-114.31
latitude,100.0,33.1441,0.597297,32.67,32.79,32.845,33.2875,34.91
housing_median_age,100.0,23.16,9.234017,5.0,17.0,21.0,30.0,48.0
total_rooms,100.0,1677.85,1070.454528,44.0,1149.25,1492.5,1989.75,7650.0
total_bedrooms,100.0,376.22,260.708963,24.0,231.5,323.5,443.5,1901.0
population,100.0,1077.71,675.559585,29.0,669.75,949.5,1314.0,3565.0
households,100.0,323.59,183.525359,15.0,206.25,286.0,412.75,1056.0
median_income,100.0,2.462202,1.168484,0.8571,1.64805,2.1073,3.1962,6.2715
median_house_value,100.0,72825.0,22706.232736,25000.0,60550.0,68850.0,84950.0,146300.0


### Selection and Indexing

In [62]:
# columns - selecting a single column returns a Series
df['population']

0     1015.0
1     1129.0
2      333.0
3      515.0
4      624.0
       ...  
95     458.0
96     533.0
97     762.0
98    1416.0
99     862.0
Name: population, Length: 100, dtype: float64

In [63]:
type(df['population'])

pandas.core.series.Series

In [64]:
# multiple columns: pass a list of names (note the double brackets);
# the result is a DataFrame
df[['population','households']]

Unnamed: 0,population,households
0,1015.0,472.0
1,1129.0,463.0
2,333.0,117.0
3,515.0,226.0
4,624.0,262.0
...,...,...
95,458.0,143.0
96,533.0,201.0
97,762.0,451.0
98,1416.0,421.0


In [65]:
# Add a derived column: population divided by households, to two decimals.
df['new_column'] = (df['population'] / df['households']).round(2)
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,new_column
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,2.15
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,2.44
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,2.85
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,2.28
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,2.38


In [66]:
# Remove the derived column again; drop() returns a new frame, so rebind df.
df = df.drop(columns='new_column')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


### Indexing

In [67]:
# Build string labels 'A0', 'A1', ... (one per row) to use as a custom index.
df['rand_col'] = [f'A{i}' for i in range(len(df))]
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rand_col
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0,A0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0,A1
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0,A2
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0,A3
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0,A4


In [68]:
df.index

RangeIndex(start=0, stop=100, step=1)

In [69]:
# Promote 'rand_col' to the row index; set_index returns a new frame, so rebind df.
df = df.set_index('rand_col')
df.head()

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
rand_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
A1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
A2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
A3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
A4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [70]:
# select single row
# integer based: iloc takes the row's position (0 = first row), whatever its label
df.iloc[0]

longitude              -114.3100
latitude                 34.1900
housing_median_age       15.0000
total_rooms            5612.0000
total_bedrooms         1283.0000
population             1015.0000
households              472.0000
median_income             1.4936
median_house_value    66900.0000
Name: A0, dtype: float64

In [71]:
# index name based: loc takes the row's label from the index
df.loc['A1']

longitude              -114.47
latitude                 34.40
housing_median_age       19.00
total_rooms            7650.00
total_bedrooms         1901.00
population             1129.00
households              463.00
median_income             1.82
median_house_value    80100.00
Name: A1, dtype: float64

In [72]:
# multiple rows
# Positional slice: the end is exclusive, so 0:3 yields the first three rows.
df.iloc[0:3]

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
rand_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
A1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
A2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0


In [73]:
# A list of labels selects several rows at once and returns a DataFrame.
df.loc[['A1','A3']]

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
rand_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
A3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0


In [74]:
# remove rows
# drop() matches index labels, not positions: since the index now holds the
# 'A0', 'A1', ... strings, an integer such as df.drop(0, axis=0) raises KeyError.
df.drop('A0', axis=0).head()

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
rand_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
A2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
A3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
A4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0
A5,-114.58,33.63,29.0,1387.0,236.0,671.0,239.0,3.3438,74000.0


In [75]:
# append row
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to add rows.
one_row = df.iloc[0]
# concat stacks frames, so the row Series is turned into a one-row frame; the
# index name is carried over so the result's index stays labelled like df's.
pd.concat([df, one_row.to_frame().T.rename_axis(df.index.name)]).tail()

Unnamed: 0_level_0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
rand_col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A96,-115.58,32.81,10.0,1088.0,203.0,533.0,201.0,3.6597,87500.0
A97,-115.58,32.79,14.0,1687.0,507.0,762.0,451.0,1.6635,64400.0
A98,-115.58,32.78,5.0,2494.0,414.0,1416.0,421.0,5.7843,110100.0
A99,-115.59,32.85,20.0,1608.0,274.0,862.0,248.0,4.875,90800.0
A0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0


## Conditional Filtering

In [79]:
df = pd.read_csv('sample_data/tips.csv')

In [80]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [86]:
# Comparing a column with a scalar yields a boolean Series (one True/False per row).
bool_series = df['total_bill'] > 45
bool_series

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Name: total_bill, Length: 244, dtype: bool

In [87]:
# Indexing with a boolean Series keeps only the rows where it is True.
df[bool_series]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
156,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321,Sun7518
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
182,45.35,3.5,Male,Yes,Sun,Dinner,3,15.12,Jose Parsons,4112207559459910,Sun2337
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [88]:
df[df['total_bill'] > 45]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
156,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321,Sun7518
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
182,45.35,3.5,Male,Yes,Sun,Dinner,3,15.12,Jose Parsons,4112207559459910,Sun2337
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [91]:
# multiple conditions
# Each comparison needs its own parentheses because & binds more tightly than > and ==.
df[(df['total_bill'] > 45) & (df['smoker'] == 'Yes')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954
182,45.35,3.5,Male,Yes,Sun,Dinner,3,15.12,Jose Parsons,4112207559459910,Sun2337


In [92]:
# ~ negates a boolean Series element-wise.
df[(df['total_bill'] > 45) & ~(df['smoker'] == 'Yes')] # or (df['smoker'] != 'Yes')
# Use & for "and" and | for "or"; the Python keywords and/or do not work element-wise on a Series.

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139
156,48.17,5.0,Male,No,Sun,Dinner,6,8.03,Ryan Gonzales,3523151482063321,Sun7518
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590


In [94]:
# .isin(): True for rows whose value appears in the given list
df[df['day'].isin(['Sat','Sun'])]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251
...,...,...,...,...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3,11.94,Kimberly Crane,676184013727,Sat9777
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880


## Missing Data

In [95]:
np.nan # float data , None for object_dtype

nan

In [96]:
pd.NA 

<NA>

In [97]:
pd.NaT # datetime_like data

NaT

In [98]:
# NaN compares unequal to everything, including itself.
np.nan == np.nan

False

In [99]:
# True because list membership checks identity (`is`) before equality,
# and the list holds the very same np.nan object.
np.nan in [np.nan]

True

In [100]:
# Both sides name the same np.nan object, so the identity check succeeds.
np.nan is np.nan

True

In [101]:
# Comparisons with pd.NA propagate the missing value: the result is <NA>, not a boolean.
pd.NA == pd.NA

<NA>

In [102]:
df = pd.read_csv('sample_data/movie_scores.csv')
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [103]:
df.isnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [104]:
df.notnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,True,True,True,True,True,True
1,False,False,False,False,False,False
2,True,True,True,True,False,False
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [105]:
df['first_name']

0      Tom
1      NaN
2     Hugh
3    Oprah
4     Emma
Name: first_name, dtype: object

In [106]:
df[df['first_name'].notnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [107]:
# Rows where the pre-movie score is missing but `sex` is recorded.
df[df['pre_movie_score'].isna() & df['sex'].notna()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
2,Hugh,Jackman,51.0,m,,


In [108]:
# drop data
# With no arguments, dropna() removes every row that has at least one missing value.
df.dropna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [109]:
# thresh=1: keep rows that have at least one non-missing value,
# i.e. drop only the rows that are entirely empty.
df.dropna(thresh=1)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [110]:
# axis=1 drops columns instead of rows. Row 1 is empty in every column,
# so every column has a gap and none survive.
df.dropna(axis=1)

0
1
2
3
4


In [111]:
# Keep only the columns that have at least 4 non-missing values.
df.dropna(thresh=4, axis=1)

Unnamed: 0,first_name,last_name,age,sex
0,Tom,Hanks,63.0,m
1,,,,
2,Hugh,Jackman,51.0,m
3,Oprah,Winfrey,66.0,f
4,Emma,Stone,31.0,f


In [112]:
# fill data
# A single fill value goes into every gap, including those in the numeric columns.
df.fillna('New Value')

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63,m,8,10
1,New Value,New Value,New Value,New Value,New Value,New Value
2,Hugh,Jackman,51,m,New Value,New Value
3,Oprah,Winfrey,66,f,6,8
4,Emma,Stone,31,f,7,9


In [113]:
df['first_name'].fillna('Some_name')

0          Tom
1    Some_name
2         Hugh
3        Oprah
4         Emma
Name: first_name, dtype: object

In [114]:
df['pre_movie_score'].mean()

7.0

In [116]:
df['pre_movie_score'].fillna(df['pre_movie_score'].mean()) # filling with mean score

0    8.0
1    7.0
2    7.0
3    6.0
4    7.0
Name: pre_movie_score, dtype: float64

In [117]:
# Fill each numeric column's gaps with that column's mean; text columns are
# left untouched. numeric_only=True is required on pandas >= 2.0, where
# df.mean() no longer silently skips text columns and raises TypeError instead.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,52.75,,7.0,9.0
2,Hugh,Jackman,51.0,m,7.0,9.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [118]:
# Interpolation demo: a labelled Series with one gap to fill.
data = pd.Series(
    [100, np.nan, 50, 40],
    index=['first', 'second', 'third', 'fourth'],
)
data

first     100.0
second      NaN
third      50.0
fourth     40.0
dtype: float64

In [119]:
# Linear interpolation fills the gap halfway between its neighbours (100 and 50 -> 75).
data.interpolate() # default method is 'linear'.

first     100.0
second     75.0
third      50.0
fourth     40.0
dtype: float64

In [121]:
# Promote the Series to a one-column DataFrame whose column is named 'Number'.
df = data.to_frame(name='Number')
df

Unnamed: 0,Number
first,100.0
second,
third,50.0
fourth,40.0


In [122]:
# Move the text labels into a regular column named 'index', leaving a default
# 0..n-1 integer index on the frame.
df = df.reset_index()
df

Unnamed: 0,index,Number
0,first,100.0
1,second,
2,third,50.0
3,fourth,40.0


In [124]:
# Interpolate only the numeric column: the text 'index' column cannot be
# interpolated, and recent pandas versions warn or raise when interpolate()
# is called on a frame that contains object-dtype columns.
# method='spline' needs SciPy and an explicit `order`.
df.assign(Number=df['Number'].interpolate(method='spline', order=2))

Unnamed: 0,index,Number
0,first,100.0
1,second,70.0
2,third,50.0
3,fourth,40.0


In [127]:
# Print the docstring of interpolate(), which lists the available methods.
help(df.interpolate)

Help on method interpolate in module pandas.core.generic:

interpolate(method: str = 'linear', axis: Union[str, int] = 0, limit: Union[int, NoneType] = None, inplace: bool = False, limit_direction: Union[str, NoneType] = None, limit_area: Union[str, NoneType] = None, downcast: Union[str, NoneType] = None, **kwargs) -> Union[~FrameOrSeries, NoneType] method of pandas.core.frame.DataFrame instance
    Please note that only ``method='linear'`` is supported for
    DataFrame/Series with a MultiIndex.
    
    Parameters
    ----------
    method : str, default 'linear'
        Interpolation technique to use. One of:
    
        * 'linear': Ignore the index and treat the values as equally
          spaced. This is the only method supported on MultiIndexes.
        * 'time': Works on daily and higher resolution data to interpolate
          given length of interval.
        * 'index', 'values': use the actual numerical values of the index.
        * 'pad': Fill in NaNs using existing values

## Inputs and Outputs

<table border="1" class="colwidths-given docutils">
<colgroup>
<col width="12%" />
<col width="40%" />
<col width="24%" />
<col width="24%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Format Type</th>
<th class="head">Data Description</th>
<th class="head">Reader</th>
<th class="head">Writer</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td>text</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/Comma-separated_values">CSV</a></td>
<td><a class="reference internal" href="#io-read-csv-table"><span class="std std-ref">read_csv</span></a></td>
<td><a class="reference internal" href="#io-store-in-csv"><span class="std std-ref">to_csv</span></a></td>
</tr>
<tr class="row-odd"><td>text</td>
<td><a class="reference external" href="https://www.json.org/">JSON</a></td>
<td><a class="reference internal" href="#io-json-reader"><span class="std std-ref">read_json</span></a></td>
<td><a class="reference internal" href="#io-json-writer"><span class="std std-ref">to_json</span></a></td>
</tr>
<tr class="row-even"><td>text</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/HTML">HTML</a></td>
<td><a class="reference internal" href="#io-read-html"><span class="std std-ref">read_html</span></a></td>
<td><a class="reference internal" href="#io-html"><span class="std std-ref">to_html</span></a></td>
</tr>
<tr class="row-odd"><td>text</td>
<td>Local clipboard</td>
<td><a class="reference internal" href="#io-clipboard"><span class="std std-ref">read_clipboard</span></a></td>
<td><a class="reference internal" href="#io-clipboard"><span class="std std-ref">to_clipboard</span></a></td>
</tr>
<tr class="row-even"><td>binary</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/Microsoft_Excel">MS Excel</a></td>
<td><a class="reference internal" href="#io-excel-reader"><span class="std std-ref">read_excel</span></a></td>
<td><a class="reference internal" href="#io-excel-writer"><span class="std std-ref">to_excel</span></a></td>
</tr>
<tr class="row-odd"><td>binary</td>
<td><a class="reference external" href="http://www.opendocumentformat.org">OpenDocument</a></td>
<td><a class="reference internal" href="#io-ods"><span class="std std-ref">read_excel</span></a></td>
<td>&#160;</td>
</tr>
<tr class="row-even"><td>binary</td>
<td><a class="reference external" href="https://support.hdfgroup.org/HDF5/whatishdf5.html">HDF5 Format</a></td>
<td><a class="reference internal" href="#io-hdf5"><span class="std std-ref">read_hdf</span></a></td>
<td><a class="reference internal" href="#io-hdf5"><span class="std std-ref">to_hdf</span></a></td>
</tr>
<tr class="row-odd"><td>binary</td>
<td><a class="reference external" href="https://github.com/wesm/feather">Feather Format</a></td>
<td><a class="reference internal" href="#io-feather"><span class="std std-ref">read_feather</span></a></td>
<td><a class="reference internal" href="#io-feather"><span class="std std-ref">to_feather</span></a></td>
</tr>
<tr class="row-even"><td>binary</td>
<td><a class="reference external" href="https://parquet.apache.org/">Parquet Format</a></td>
<td><a class="reference internal" href="#io-parquet"><span class="std std-ref">read_parquet</span></a></td>
<td><a class="reference internal" href="#io-parquet"><span class="std std-ref">to_parquet</span></a></td>
</tr>
<tr class="row-odd"><td>binary</td>
<td><a class="reference external" href="https://msgpack.org/index.html">Msgpack</a></td>
<td><a class="reference internal" href="#io-msgpack"><span class="std std-ref">read_msgpack</span></a></td>
<td><a class="reference internal" href="#io-msgpack"><span class="std std-ref">to_msgpack</span></a></td>
</tr>
<tr class="row-even"><td>binary</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/Stata">Stata</a></td>
<td><a class="reference internal" href="#io-stata-reader"><span class="std std-ref">read_stata</span></a></td>
<td><a class="reference internal" href="#io-stata-writer"><span class="std std-ref">to_stata</span></a></td>
</tr>
<tr class="row-odd"><td>binary</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/SAS_(software)">SAS</a></td>
<td><a class="reference internal" href="#io-sas-reader"><span class="std std-ref">read_sas</span></a></td>
<td>&#160;</td>
</tr>
<tr class="row-even"><td>binary</td>
<td><a class="reference external" href="https://docs.python.org/3/library/pickle.html">Python Pickle Format</a></td>
<td><a class="reference internal" href="#io-pickle"><span class="std std-ref">read_pickle</span></a></td>
<td><a class="reference internal" href="#io-pickle"><span class="std std-ref">to_pickle</span></a></td>
</tr>
<tr class="row-odd"><td>SQL</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/SQL">SQL</a></td>
<td><a class="reference internal" href="#io-sql"><span class="std std-ref">read_sql</span></a></td>
<td><a class="reference internal" href="#io-sql"><span class="std std-ref">to_sql</span></a></td>
</tr>
<tr class="row-even"><td>SQL</td>
<td><a class="reference external" href="https://en.wikipedia.org/wiki/BigQuery">Google Big Query</a></td>
<td><a class="reference internal" href="#io-bigquery"><span class="std std-ref">read_gbq</span></a></td>
<td><a class="reference internal" href="#io-bigquery"><span class="std std-ref">to_gbq</span></a></td>
</tr>
</tbody>
</table>

> `pwd` prints the current working directory.  
> `ls` lists the files in that directory.

- pd.read_csv('path/file.csv')
- df.to_csv('new_file.csv', index=False)
- pd.read_html('url')
- df.to_html('name.html', index=False)
- pd.read_excel('name.xlsx', sheet_name = 'sheet')
- pd.ExcelFile('name.xlsx').sheet_names # return names of sheets
- df.to_excel('name.xlsx', sheet_name = 'name', index=False)

In [130]:
# SQL databases via SQLAlchemy (the example below uses SQLite, not MySQL)
from sqlalchemy import create_engine

In [131]:
# Temporary SQLite database held in memory; nothing is written to disk.
temp_db = create_engine('sqlite:///:memory:')
temp_db

Engine(sqlite:///:memory:)

In [134]:
# Write to db
# read_html returns a list with one DataFrame per HTML table found on the page.
# NOTE(review): the hard-coded position 6 depends on the page's current layout
# and may point at a different table whenever the article is edited.
tables = pd.read_html('https://en.wikipedia.org/wiki/World_population')
tables[6]

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2),Population trend
0,1,India,1381950000,3287240,420,Growing
1,2,Pakistan,225000000,803940,280,Rapidly growing
2,3,Bangladesh,171360000,143998,1190,Rapidly growing
3,4,Japan,126010000,377873,333,Declining[98]
4,5,Philippines,110830000,300000,369,Growing
5,6,Vietnam,96209000,331689,290,Growing
6,7,United Kingdom,66436000,243610,273,Growing
7,8,South Korea,51781000,99538,520,Steady
8,9,Taiwan,23604000,36193,652,Steady
9,10,Sri Lanka,21803000,65610,332,Growing


In [135]:
# if_exists='replace' makes the cell safe to re-run: the default ('fail')
# raises ValueError once the 'populations' table already exists.
# The frame's index is written too, as a column named 'index'.
tables[6].to_sql(name='populations', con=temp_db, if_exists='replace')

In [136]:
# read from sql: load the whole 'populations' table by name
pd.read_sql_table('populations', con=temp_db)

Unnamed: 0,index,Rank,Country,Population,Area(km2),Density(pop/km2),Population trend
0,0,1,India,1381950000,3287240,420,Growing
1,1,2,Pakistan,225000000,803940,280,Rapidly growing
2,2,3,Bangladesh,171360000,143998,1190,Rapidly growing
3,3,4,Japan,126010000,377873,333,Declining[98]
4,4,5,Philippines,110830000,300000,369,Growing
5,5,6,Vietnam,96209000,331689,290,Growing
6,6,7,United Kingdom,66436000,243610,273,Growing
7,7,8,South Korea,51781000,99538,520,Steady
8,8,9,Taiwan,23604000,36193,652,Steady
9,9,10,Sri Lanka,21803000,65610,332,Growing


In [141]:
# read in with a query: only the selected columns and the matching rows come back
pd.read_sql(sql = "SELECT Country,Population FROM populations WHERE Population > 100000000", con=temp_db)

Unnamed: 0,Country,Population
0,India,1381950000
1,Pakistan,225000000
2,Bangladesh,171360000
3,Japan,126010000
4,Philippines,110830000
