In [None]:
%matplotlib inline

# importing pandas and numpy
import pandas as pd
import numpy as np

import os

In [None]:
# download dataset (skipped when the CSV is already present locally)
from urllib.request import urlretrieve
filename = os.path.join('..', 'data', 'datasets', 'kc_house_data.csv')
if not os.path.isfile(filename):
    # urlretrieve does not create missing parent folders, so make sure the
    # target directory exists before starting the download
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    print("Downloading %s" % filename)
    urlretrieve('https://s3.eu-central-1.amazonaws.com/ai-captain-public/datasets/kc_house_data.csv', filename)

# about CSV

In [None]:
# read the CSV file at `filename` into a dataframe
df = pd.read_csv(filename)

In [None]:
# show shape as a (number of rows, number of columns) tuple
df.shape

In [None]:
# preview the top of the dataframe; n=5 is also what head() uses by default
df.head(n=5)

In [None]:
# preview the bottom of the dataframe; n=5 is also what tail() uses by default
df.tail(n=5)

In [None]:
# column names as a plain Python list
df.columns.tolist()

In [None]:
# selecting a single column by its name
df['bathrooms'] # or df.bathrooms

In [None]:
# selecting data by integer row position (.iloc); the slice end is exclusive,
# so this returns the rows at positions 5 through 9
df.iloc[5:10]

In [None]:
# bedrooms values for the rows at positions 5 through 9:
# pick the column first, then slice it by position
df['bedrooms'].iloc[5:10] # or df.iloc[5:10]['bedrooms']

In [None]:
# selecting data by label or by a conditional statement (.loc):
# keep only the rows with more than 10 bedrooms
df.loc[df['bedrooms'].gt(10)]

In [None]:
# getting the maximum value of a column
df['sqft_living'].max()   # or df.sqft_living.max()

In [None]:
# getting the minimum value of a column
df['sqft_living'].min()   # or df.sqft_living.min()

In [None]:
# Clip sqft_living to 10000 using mask: every value above the cap is
# replaced by the cap itself, all other values are left untouched
sqft_cap = 10000
above_cap = df["sqft_living"] > sqft_cap
df["sqft_living"] = df["sqft_living"].mask(above_cap, sqft_cap)
# the maximum now equals the cap if any value exceeded it
df["sqft_living"].max()

In [None]:
# create a dataframe from two series, both containing missing values
a = pd.Series([1, 2, 3, np.nan, np.nan, 4, 5])
b = pd.Series(['dsads', 'dsda', np.nan, 'dsad', 'dsad', 'dasda', 'dsda'])
# the dict keys become the column names
df1 = pd.concat({'a': a, 'b': b}, axis=1)
df1

In [None]:
# count missing values in each column
df1.isna().sum()

In [None]:
# fill missing values
# Assign the filled column back instead of calling fillna(inplace=True) on
# df1['a']: the in-place form works on an intermediate Series, which triggers
# a chained-assignment warning in pandas 2.x and leaves df1 unchanged once
# Copy-on-Write is active.
df1['a'] = df1['a'].fillna(0)
df1['b'] = df1['b'].fillna('nope')
df1

In [None]:
# create a second dataframe with the same content, again from two series
a = pd.Series([1, 2, 3, np.nan, np.nan, 4, 5])
b = pd.Series(['dsads', 'dsda', np.nan, 'dsad', 'dsad', 'dasda', 'dsda'])
# `keys` supplies the column names for the concatenated series
df_1 = pd.concat([a, b], axis=1, keys=['a', 'b'])
df_1

In [None]:
# propagate last valid observation forward (column 'a' only)
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back because an in-place fill on df_1['a'] operates on an
# intermediate Series and does not reliably update df_1.
df_1['a'] = df_1['a'].ffill()
df_1

In [None]:
# Drop every row that has at least one missing value, modifying df_1 itself
# (axis=0 and how='any' are the defaults, spelled out here for clarity)
df_1.dropna(axis=0, how='any', inplace=True)
df_1

In [None]:
# sorting: order rows by bathrooms, largest first, and show the top five
(
    df
    .sort_values(by='bathrooms', ascending=False)
    .head(n=5)
)

In [None]:
# basic plotting: scatter plot of sqft_above against sqft_living
df.plot(kind='scatter', x='sqft_living', y='sqft_above')

In [None]:
# groupby: mean price for each value of condition, drawn as a bar chart
(
    df.groupby('condition')['price']
    .mean()
    .plot.bar()
)

In [None]:
# merge: left-join df1 with a small lookup table on column 'b', then drop
# the rows that still contain a missing value (e.g. no match found in df2)
df2 = pd.DataFrame({'b': ['dsad', 'nope'], 'c': [22, 300]})
df1.merge(df2, how='left', on='b').dropna()

In [None]:
# pairwise correlation of the numeric columns, displayed as a colour-graded table
# Select numeric columns explicitly: since pandas 2.0 DataFrame.corr() no longer
# skips non-numeric columns and raises if the frame contains any (e.g. text).
corr = df.select_dtypes(include='number').corr()
corr.style.background_gradient()