# Python for data science



 * `json`
 * `joblib`
 * `csv`
 * `pandas`
 * `keras`

In [1]:
import sys
!{sys.executable} -m pip install joblib pandas sklearn keras



## Read and write JSON using `json`
```
json.dump(<obj>, <file>)
```

```
json.load(<file>)
```

In [2]:
import json

data = [
    {
        "x": 0,
        "y": 1
    }, {
        "x": 1,
        "y": 2
    },{
        "x": 2,
        "y": 3
    }
]

with open("data.json", "w") as f:
    json.dump(data, f)

In [3]:
with open("data.json") as f:
    data2 = json.load(f)
    
assert data == data2

## Parallel processing using `joblib`


```
Parallel(n_jobs=<number of parallel jobs>)(<jobs>)
```
```
delayed(<function>)(<arguments>)
```

### Set up dataset

In [4]:
import json
import random
from math import sqrt

data = [
    {
        "x1": x1,
        "x2": x2,
        "y": 1 if sqrt(x1 ** 2 + x2 ** 2) >= 100 else 0
    } for _ in range(1000000) 
    if (x1 := random.randrange(0,100)) is not None 
    if (x2 := random.randrange(0,100)) is not None
]

with open("data.json", "w") as f:
    json.dump(data, f)

### Process dataset

In [5]:
from joblib import Parallel, delayed

with open("data.json") as f:
    data2 = json.load(f)
    
headers = ["x1", "x2", "y"]

def obj_to_row(obj):
    return [obj[header] for header in headers]

rows = Parallel(n_jobs=4)(delayed(obj_to_row)(obj) for obj in data2)

## Reading and write CSV using `csv`

In [6]:
import random
import csv

with open("data.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(headers)
    for row in rows:
        w.writerow([random.choice([row[0], None]), row[1], row[2]])
        

In [7]:
with open("data.csv", newline="") as f:
    r = csv.reader(f)
    print(next(r))
    for _ in range(5):
        print(next(r))

['x1', 'x2', 'y']
['', '81', '0']
['', '29', '0']
['32', '63', '0']
['33', '10', '0']
['16', '53', '0']


## Processing dataframe using `pandas`


In [8]:
import pandas as pd

df = pd.read_csv("data.csv")

In [9]:
df.head()

Unnamed: 0,x1,x2,y
0,,81,0
1,,29,0
2,32.0,63,0
3,33.0,10,0
4,16.0,53,0


In [10]:
df.describe()

Unnamed: 0,x1,x2,y
count,499937.0,1000000.0,1000000.0
mean,49.519325,49.498514,0.204899
std,28.870636,28.875909,0.403628
min,0.0,0.0,0.0
25%,24.0,24.0,0.0
50%,50.0,49.0,0.0
75%,75.0,75.0,0.0
max,99.0,99.0,1.0


In [11]:
df.to_csv("df.csv")

### Data cleanup

In [12]:
df_cleanedup = df[df["x1"].notna()]
df_cleanedup.head()

Unnamed: 0,x1,x2,y
2,32.0,63,0
3,33.0,10,0
4,16.0,53,0
5,75.0,51,0
6,51.0,9,0


In [13]:
df_cleanedup["y"].value_counts()

0    397549
1    102388
Name: y, dtype: int64

### Down sample majority class

In [14]:
df_cleanedup_majority = df_cleanedup[df_cleanedup["y"] == 0]
df_cleanedup_minority = df_cleanedup[df_cleanedup["y"] == 1]
 

df_cleanedup_majority_downsampled = df_cleanedup_majority.sample(n=len(df_cleanedup_minority["y"].index))
 
df_downsampled = pd.concat([df_cleanedup_majority_downsampled, df_cleanedup_minority])
 
df_downsampled["y"].value_counts()

1    102388
0    102388
Name: y, dtype: int64

## Build a machine learning model using `sklearn` and `keras`


### Prepare training and test data

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_unnormalized = df_downsampled[["x1", "x2"]]
y = df_downsampled["y"]

scaler = MinMaxScaler()
scaler.fit(X_unnormalized)
X = scaler.transform(X_unnormalized)

X


array([[0.18181818, 0.24242424],
       [0.32323232, 0.34343434],
       [0.32323232, 0.26262626],
       ...,
       [0.87878788, 0.75757576],
       [0.95959596, 0.85858586],
       [0.73737374, 0.8989899 ]])

In [16]:
X.shape

(204776, 2)

In [17]:
y.shape

(204776,)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [19]:
X_train.shape

(163820, 2)

In [20]:
X_test.shape

(40956, 2)

In [21]:
y_train.shape

(163820,)

In [22]:
y_test.shape

(40956,)

### Build a model

In [23]:
from keras.layers import Input, Dense
from keras.models import Model
import keras.utils

input = Input(shape=(2,))
x = Dense(4, activation="relu")(input)
x = Dense(4, activation="relu")(x)
output = Dense(1, activation="sigmoid")(x)
model = Model(input, output)

model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
dense (Dense)                (None, 4)                 12        
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 20        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 37
Trainable params: 37
Non-trainable params: 0
_________________________________________________________________


### Train a model

In [24]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
model.evaluate(X_test, y_test)



[0.7237554788589478, 0.6049907207489014]

In [25]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f2800641b80>

### Evaluate a model

In [26]:
model.evaluate(X_test, y_test)



[0.5271095633506775, 0.9489940404891968]