<div style="margin: 10px">
<a href="http://pandas.pydata.org"><img style="margin:8px; display:inline; object-fit:scale-down; max-height:140px" src="./assets/pandas.png"/></a>
</div>

## Reading data


In [None]:
import pandas as pd

### Data sources
* Can be a local file
* Can be the internet
* Can be a database server, etc. 
* Formats: csv, excel, stata, etc.

Reading data from a URL

In [None]:
# Read a dataset of alcohol consumption into a DataFrame.
# Use HTTPS rather than plain HTTP so the download cannot be tampered with in transit.
drinks = pd.read_csv('https://bit.ly/drinksbycountry')
drinks.head()

#### Describe and explore the data

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
drinks.describe()

In [None]:
# column names, non-null counts, dtypes and memory usage
drinks.info()

#### Adding and deleting columns

In [None]:
# new column: element-wise sum of the three per-beverage serving columns
drinks['total_serving'] = (
    drinks['beer_servings']
    .add(drinks['spirit_servings'])
    .add(drinks['wine_servings'])
)
drinks.head()

In [None]:
# remove the helper column again; drop() returns a new frame, so rebind the name
drinks = drinks.drop(columns=['total_serving'])
drinks.head()

In [None]:
# mean beer servings restricted to the rows whose continent is Africa
drinks.loc[drinks['continent'] == 'Africa', 'beer_servings'].mean()

#### Grouping

In [None]:
# mean beer servings per continent (one value per group)
drinks.groupby('continent')['beer_servings'].mean()

In [None]:

# several aggregations at once: one output column per function name
drinks.groupby('continent')['beer_servings'].agg(['count', 'mean', 'min', 'max'])

In [None]:
# Specifying a column to which the aggregation function should be applied is not required.
# numeric_only=True restricts the mean to numeric columns: since pandas 2.0 the
# default no longer silently drops text columns and raises a TypeError instead
# (this dataset presumably carries a text `country` column).
drinks.groupby('continent').mean(numeric_only=True)

In [None]:
%matplotlib inline
# side-by-side bar plot of the DataFrame directly above
drinks.groupby('continent').mean().plot(kind='bar')

### Data can be read from a local file

In [None]:
# Data from the World Bank, read from a CSV file stored relative to the notebook
countries = pd.read_csv('./data/energy_countries.csv')

In [None]:
# preview the first five rows
countries.head()

In [None]:
# column names, non-null counts, dtypes and memory usage
countries.info()

In [None]:
# summary statistics of the numeric columns
countries.describe()

In [None]:
# box plots of the '1990' and '2015' columns, drawn side by side
countries[['1990', '2015']].boxplot()

In [None]:
# boolean mask: True for every row that has at least one missing value
null_idx = countries.isna().any(axis=1)
countries.loc[null_idx]

In [None]:
# per-column number of non-missing values among the incomplete rows
countries.loc[null_idx].count()

* fill blanks
* replace nan
* create new rows
* delete rows
* histograms
* correlations

In [None]:
# Median of every numeric column. numeric_only=True keeps this working on
# pandas >= 2.0, where text columns (presumably the country names here) are no
# longer dropped silently and make median() raise a TypeError.
countries.median(numeric_only=True)

### Custom reading from the Internet

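Example taken from Aurélien Géron's *Hands-On Machine Learning* companion repository (https://github.com/ageron/handson-ml):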

<img src="assets/images/geron.jpg" width="200" height="200">


In [None]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download a gzipped tar archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the archive to download.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and the
        extracted members; created if it does not exist yet.

    Returns
    -------
    None
        The function is used for its side effects on the file system.
    """
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # housing_path (absolute paths, "..", unsafe links).
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [None]:
# download and unpack the archive using the default URL and target directory
fetch_housing_data()

In [None]:
import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# read housing.csv from the default HOUSING_PATH directory and preview the first rows
housing = load_housing_data()
housing.head()

In [None]:
# number of rows for each distinct value of the ocean_proximity column
housing["ocean_proximity"].value_counts()

In [None]:
# summary statistics of the numeric columns
housing.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# one histogram per numeric column, 50 bins each, on a single large figure
housing.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Scatter plot of the coordinates; the low alpha makes dense areas stand out.
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
# `save_fig` is not defined anywhere in this notebook and would raise a
# NameError, so the figure is written with matplotlib directly.
plt.savefig("better_visualization_plot.png", dpi=300)

In [None]:
# Marker size encodes population, marker colour encodes the median house value.
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=housing["population"] / 100,
    label="population",
    figsize=(10, 7),
    c="median_house_value",
    cmap="jet",
    colorbar=True,
    sharex=False,
)
plt.legend()


In [None]:
# Pairwise correlation of the numeric columns only. The frame also holds the
# text column `ocean_proximity`, which makes a bare corr() raise on
# pandas >= 2.0; select_dtypes works on older pandas versions as well.
corr_matrix = housing.select_dtypes(include="number").corr()

In [None]:
# correlation of every column with median_house_value, strongest positive first
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

# grid of pairwise scatter plots for the four selected columns
attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))


In [None]:
# First five rows that contain at least one missing value. An explicit copy is
# taken because this sample is modified further down; mutating a slice of
# `housing` would otherwise trigger pandas' SettingWithCopyWarning.
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head().copy()
sample_incomplete_rows

In [None]:
# option 1: drop the rows whose total_bedrooms is missing (result is only displayed, not stored)
sample_incomplete_rows.dropna(subset=["total_bedrooms"])    # option 1

In [None]:
# option 3: replace missing total_bedrooms values with the median of the full dataset
median = housing["total_bedrooms"].median()
# Fill at the frame level and rebind the name. Calling fillna(inplace=True) on a
# column selection is chained assignment: it is deprecated and leaves the frame
# untouched once pandas' Copy-on-Write behaviour is active.
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})
sample_incomplete_rows