# Plain text files

Read all lines at once

In [None]:
file_path = '../data/README.txt'
# /Users/tinnguyen/Dropbox/TIL6010/Tin/TIL6010/Lecture slides/Week3/data/README.txt
with open(file_path, 'r') as fp:
    lines = fp.readlines()
    print(lines)

In [None]:
lines[0]

Read one line

In [None]:
with open(file_path, 'r') as fp:
    line = fp.readline()
    print(line)

Loop over lines

In [None]:
# Read the file one line at a time with readline().
with open(file_path, 'r') as fp:
    # The assignment expression stores the next line and tests it in one step.
    # readline() returns '' at end of file, which is falsy and ends the loop.
    while (line := fp.readline()):
        # The line still carries its own newline, so print() adds a second one.
        print(line)

In [None]:
with open(file_path, 'r') as fp:
    for line in fp:
        print(line)
        # do some processing

<hr br>

# JSON Files

In [None]:
import json

In [None]:
file_path = '../data/employee_details.json'
# JSON text is UTF-8 by specification, so state the encoding explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open(file_path, 'r', encoding='utf-8') as fp:
    # Parse the whole document into Python objects.
    data = json.load(fp)
# Show what kind of object was loaded, then its content.
print(type(data))
print(data)
data

In [None]:
data['firstName']

In [None]:
data['phoneNumbers']

In [None]:
data['phoneNumbers'][0].values()

*Challenge*: How to retrieve John's office phone number?

In [None]:
# Walk through every phone entry and print the number of the office one.
for phone in data['phoneNumbers']:
    # 'phone' is the current entry of data['phoneNumbers']
    print(f'now checking {phone}')
    print(phone['type'])
    # skip every entry that is not the office phone
    if phone['type'] != 'office':
        continue
    # this is the office entry: print its number
    print(phone['number'])

In [None]:
phone['type']

<hr br>

# HDF5 Files

In [None]:
import h5py
import matplotlib.pyplot as plt

Open file for reading. `h5py.File` acts like a Python dictionary. (<https://docs.h5py.org/en/stable/quick.html>)  
We will load an image dataset.

In [None]:
data = h5py.File('../data/train_signs.h5', 'r')
data

See what this data includes

In [None]:
data.keys()

Get `train_set_x`

In [None]:
train = data.get('train_set_x')
# data['train_set_x']
train

1080 color images of size 64 by 64. 3 indicates 3 color channels

Show the 5th image

In [None]:
images = train[()]
plt.figure(figsize=(6,6))
plt.imshow(images[4])
plt.show()

Introduce numpy

In [None]:
data['train_set_y']

In [None]:
data['train_set_y'][4]

The correct label for the 5th image is 2.

*Challenge*: Load the dataset from <https://www.kaggle.com/olgabelitskaya/classification-of-handwritten-letters>  
Display the 10th image

<hr br>

# Pickled Files

In [None]:
import pickle

In [None]:
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load pickle files that come from a source you trust.
with open('pickle_data.pkl', 'rb') as fp:  # pickle data is binary, hence 'rb'
    data = pickle.load(fp)

<hr br>

# Matlab Files

In [None]:
from scipy.io import loadmat

In [None]:
_data = loadmat('traffic_data.mat', squeeze_me=True)
_data

In [None]:
data = _data['pData']
data

In [None]:
type(data)

Get fields by using `dtype`

In [None]:
data.dtype.names

In [None]:
data['speed']

In [None]:
speed = data['speed'].item()
print(speed)
# plt.figure(figsize=(9,9))
plt.imshow(speed)
plt.show()

<hr br>

# Working with <span style="color:blue">Pandas</span>

<https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html>

In [None]:
import pandas as pd

### Import data

Let's first read data from a csv file.  
[Customer Personality Analysis](https://www.kaggle.com/imakash3011/customer-personality-analysis)

In [None]:
file_path = '../data/marketing_campaign.csv'
# The fields are separated by tabs even though the file is named .csv.
df = pd.read_csv(file_path, sep='\t')

In [None]:
df

In [None]:
type(df)

Let's look at the first few rows of the data. You can use the `head()` method of the dataframe, e.g. `df.head()`.

In [None]:
df.head(10)

Show all the columns

In [None]:
df.columns

Show the row index

In [None]:
df.index

*Challenge*: How many rows/columns does this dataframe have?

Size/Length of the data

In [None]:
df.shape

### Accessing cells

Get a whole column

In [None]:
df['Year_Birth']

To access multiple columns, put column names in a list

In [None]:
df[['Year_Birth', 'Education']]

You can also access rows. There are several ways, using the `iloc`, `loc` or `at` indexers.
- [`iloc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html) accesses rows by *integer position*
- [`loc`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html) accesses rows by *label*
- `at` accesses a single cell by its row label and column label

In [None]:
df.index

In [None]:
df.head(10)

In [None]:
df.index

In [None]:
# Illustration of the row labels shown by df.index: 0, 1, 2, and so on up to 2239.
# The `...` (Ellipsis) literal stands in for the omitted values and keeps the
# cell valid Python.
[0, 1, 2, ..., 2239]

In [None]:
df.iloc[5]

Also, multiple rows

In [None]:
df.iloc[[5,2]]

By taking the rows at positions 5 and 2, the resulting dataframe has two rows with index order [5, 2]. That means that accessing `iloc[0]` on this dataframe returns the row at position 5 (index label 5) of the original dataframe.

In [None]:
df_small = df.iloc[[5,2]]
df_small

In [None]:
df_small.index

In [None]:
df_small.iloc[0]

Accessing using `loc`

In [None]:
df.loc

In [None]:
df_small.index

In [None]:
df_small.loc[5]

It is better to give names to indexes, if possible, or use a column as index for dataframes. For example, with this dataframe, we can use column ID as the index, since each row has a unique ID.

In [None]:
df.set_index('ID', inplace=True)

In [None]:
df.head(10)

In [None]:
df.index

In [None]:
df.iloc[4]

In [None]:
df.loc[5324]

In [None]:
df.iloc[0]

Use both rows and columns

In [None]:
# Pass the row label and the column labels to a single .loc call instead of
# chaining two lookups; this avoids building the full row as an intermediate.
df.loc[5324, ['Income', 'Education']]

In [None]:
df['Income'].loc[5324]

In [None]:
df['Income'][5324]

In [None]:
df['Income'].head()

In [None]:
df['Income'].iloc[0]

In [None]:
# NOTE: 'ID' is now the index, so a plain [0] on this Series is looked up as a
# label (the row whose ID is 0), not as the first row. Use .iloc[0] for
# positional access. (Assumes the ID labels are integers — TODO confirm.)
df['Income'][0]

### Explore data

In [None]:
df.describe()

In [None]:
df.count()

In [None]:
df['Education'].unique()

Filtering

In [None]:
is_master = df['Education'] == 'Master'
is_master

In [None]:
df[is_master]

Combine multiple conditions over different columns

Challenge:
How many Bachelor graduates (education level `Graduation`) have an income higher than 40000?
Hint:
- Use the logical operator *and* (`&`) to combine conditions

In [None]:
is_graduation = df['Education'] == 'Graduation'
high_income = df['Income'] > 40000
combine_condition = is_graduation & high_income
df[combine_condition]

Use `isna()` function to find cells with missing data

In [None]:
df[df['Income'].isna()]

### GroupBy

<https://pandas.pydata.org/docs/reference/groupby.html>

For each birth year, count the number of people.

In [None]:
gb_year = df.groupby('Year_Birth')

In [None]:
gb_year.get_group(1991)

In [None]:
df.groupby('Year_Birth').size()

*Challenge*: How does income compare across education levels?

In [None]:
# Exercise template: replace each `...` placeholder with your own code.
# Group by Education
gp_edu = ...
# Extract only relevant information, i.e. Income
income_by_group = ...
# Derive some (simple) statistics of Income of each group

# Compare

*Challenge*: Is there a relationship between having kids and the amount of wine consumed? How about people with teenagers?

In [None]:
# Exercise template: replace each `...` placeholder with your own code.
# Create a new column that indicates whether a person has kids
df['HasKids'] = ...
# Group data by this column
gp_haskids = ...
# Take only relevant information
wine_consumption_by_group = ...
# Derive statistic
