### tutorial five: handle missing data: fillna, dropna, iterpolate

In [14]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/pandas/5_handling_missing_data_fillna_dropna_interpolate/weather_data.csv', parse_dates = ["day"])
df.set_index('day', inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [16]:
new_df = df.fillna('Zayn')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,Zayn,9.0,Sunny
2017-01-05,28.0,Zayn,Snow
2017-01-06,Zayn,7.0,Zayn
2017-01-07,32.0,Zayn,Rain
2017-01-08,Zayn,Zayn,Sunny
2017-01-09,Zayn,Zayn,Zayn
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [17]:
# to fill specific values in data frame, use the .fillna() with a dictionary
second_df = df.fillna({
    'temperature': 0, 
    'windspeed': 0,
    'event': 'no event'
})
second_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [19]:
new_df = df.interpolate(method = "time")
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### tutorial 6: replace function (handle missing data)

In [24]:
import numpy as np
df2 = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/pandas/6_handling_missing_data_replace/weather_data.csv')
df2
#for index, row in df2.iterrows():
    #print(row)


Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/2/2017,-99999,7,Sunny
2,1/3/2017,28,-99999,Snow
3,1/4/2017,-99999,7,0
4,1/5/2017,32,-99999,Rain
5,1/6/2017,31,2,Sunny
6,1/6/2017,34,5,0


In [26]:
# replace -99999 with NaN

df2 = df.replace(-99999, 'NaN')
df2

#if there are many values you'd like to replace, use a list:

#new_df = df.replace([-99999, -88888], np.NaN)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


### tutorial 7. Group By (Split Apply Combine)

In [32]:
df3 = pd.read_csv('https://raw.githubusercontent.com/codebasics/py/master/pandas/7_group_by/weather_by_cities.csv')
df3

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,33,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
7,1/4/2017,mumbai,92,5,Rain
8,1/1/2017,paris,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy


In [36]:
g = df.groupby('city')
g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff0400559d0>


In [44]:
for city, city_df in g:
    print(city)
    print(city_df)

mumbai
        day    city  temperature  windspeed  event
4  1/1/2017  mumbai           90          5  Sunny
5  1/2/2017  mumbai           85         12    Fog
6  1/3/2017  mumbai           87         15    Fog
7  1/4/2017  mumbai           92          5   Rain
new york
        day      city  temperature  windspeed  event
0  1/1/2017  new york           32          6   Rain
1  1/2/2017  new york           36          7  Sunny
2  1/3/2017  new york           28         12   Snow
3  1/4/2017  new york           33          7  Sunny
paris
         day   city  temperature  windspeed   event
8   1/1/2017  paris           45         20   Sunny
9   1/2/2017  paris           50         13  Cloudy
10  1/3/2017  paris           54          8  Cloudy
11  1/4/2017  paris           42         10  Cloudy


### tutorial 8: concat dataframes

In [46]:
africa_weather = pd.DataFrame({
    'country': ['nigeria', 'kenya', 'republic of congo'],
    'temperature': [30, 45, 89],
    'humidity': [43, 2, 23]
})
africa_weather

Unnamed: 0,country,temperature,humidity
0,nigeria,30,43
1,kenya,45,2
2,republic of congo,89,23


In [49]:
us_weather = pd.DataFrame({
    'country': ['washington', 'massachusetts', 'pennsylvania'],
    'temperature': [50, 38, 44],
    'humidity': [2, 21, 12]
})
us_weather

Unnamed: 0,country,temperature,humidity
0,washington,50,2
1,massachusetts,38,21
2,pennsylvania,44,12


In [54]:
dfs = pd.concat([africa_weather, us_weather], keys = ['africa', 'us'])
dfs

dfs.loc['africa']

Unnamed: 0,country,temperature,humidity
0,nigeria,30,43
1,kenya,45,2
2,republic of congo,89,23


In [56]:
windspeed_df = pd.DataFrame({
    'city': ['mumbai', 'delhi', 'bangalore'],
    'windspeed': [7, 12, 9]
})

another_df = pd.DataFrame({
    'city': ['mumbai', 'delhi', 'bangalore'],
    'temperature': [70, 33, 49]
})

In [60]:
# how do you remove city from the output?
concat = pd.concat([windspeed_df, another_df], axis=1)
concat

Unnamed: 0,city,windspeed,city.1,temperature
0,mumbai,7,mumbai,70
1,delhi,12,delhi,33
2,bangalore,9,bangalore,49


In [62]:
s = pd.Series(["Humid", "Dry", "Rain"], name="event")
s

0    Humid
1      Dry
2     Rain
Name: event, dtype: object

In [66]:
df = pd.concat([windspeed_df, s], axis=1)
df

Unnamed: 0,city,windspeed,event
0,mumbai,7,Humid
1,delhi,12,Dry
2,bangalore,9,Rain


### tutorial 9: merge dataframes

In [68]:
df1 = pd.DataFrame({
    'city': ['new york', 'chicago', 'orlando'],
    'temperature': [21, 14, 35],
})
df1

Unnamed: 0,city,temperature
0,new york,21
1,chicago,14
2,orlando,35


In [70]:
df2 = pd.DataFrame({
    'city': ['chicago', 'new york', 'orlando'],
    'humidity': [65, 68, 75]
})
df2

Unnamed: 0,city,humidity
0,chicago,65
1,new york,68
2,orlando,75


In [71]:
df3 = pd.merge(df1, df2, on = "city")
df3

Unnamed: 0,city,temperature,humidity
0,new york,21,68
1,chicago,14,65
2,orlando,35,75


In [76]:
df1 = pd.DataFrame({
    'city': ['new york', 'chicago', 'orlando', 'baltimore'],
    'temperature': [21, 14, 35, 80],
})
df1

df2 = pd.DataFrame({
    'city': ['chicago', 'new york', 'san francisco'],
    'humidity': [65, 68, 75]
})
df2

df3 = pd.merge(df1, df2, on = "city", how="outer", indicator=True)
df3

Unnamed: 0,city,temperature,humidity,_merge
0,new york,21.0,68.0,both
1,chicago,14.0,65.0,both
2,orlando,35.0,,left_only
3,baltimore,80.0,,left_only
4,san francisco,,75.0,right_only
