# python for data science



 * `json`
 * `joblib`
 * `csv`
 * `pandas`
 * `keras`

In [1]:
import sys
!{sys.executable} -m pip install joblib pandas sklearn keras



## Read and write JSON using `json`

In [2]:
import json

data = [
    {
        "x": 0,
        "y": 1
    }, {
        "x": 1,
        "y": 2
    },{
        "x": 2,
        "y": 3
    }
]

with open("data.json", "w") as f:
    json.dump(data, f)

In [3]:
with open("data.json") as f:
    data2 = json.load(f)
    
assert data == data2

## Parallel processing using `joblib`

In [4]:
import json
import random
from math import sqrt

data = [
    {
        "x1": x1,
        "x2": x2,
        "y": 1 if sqrt(x1 ** 2 + x2 ** 2) >= 100 else 0
    } for _ in range(1000000) if (x1 := random.randrange(0,100)) is not None if (x2 := random.randrange(0,100)) is not None
]

with open("data.json", "w") as f:
    json.dump(data, f)
    

with open("data.json") as f:
    data2 = json.load(f)

In [5]:
from joblib import Parallel, delayed

headers = ["x1", "x2", "y"]

def obj_to_row(obj):
    return [obj[header] for header in headers]

rows = Parallel(n_jobs=4)(delayed(obj_to_row)(obj) for obj in data2)

## Reading and write CSV using `csv`

In [6]:
import random
import csv

with open("data.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(headers)
    for row in rows:
        w.writerow([random.choice([row[0], None]), row[1], row[2]])
        

In [7]:
with open("data.csv", newline="") as f:
    r = csv.reader(f)
    print(next(r))
    for _ in range(5):
        print(next(r))

['x1', 'x2', 'y']
['', '66', '0']
['25', '44', '0']
['39', '37', '0']
['13', '68', '0']
['92', '99', '1']


## Processing dataframe using `pandas`


In [8]:
import pandas as pd

df = pd.read_csv("data.csv")

In [9]:
df.head()

Unnamed: 0,x1,x2,y
0,,66,0
1,25.0,44,0
2,39.0,37,0
3,13.0,68,0
4,92.0,99,1


In [10]:
df.describe()

Unnamed: 0,x1,x2,y
count,499875.0,1000000.0,1000000.0
mean,49.478784,49.5285,0.205337
std,28.877488,28.868384,0.403948
min,0.0,0.0,0.0
25%,24.0,25.0,0.0
50%,49.0,50.0,0.0
75%,74.0,75.0,0.0
max,99.0,99.0,1.0


In [11]:
df.to_csv("df.csv")

Data cleanup

In [12]:
df_cleanedup = df[df["x1"].notna()]
df_cleanedup.head()

Unnamed: 0,x1,x2,y
1,25.0,44,0
2,39.0,37,0
3,13.0,68,0
4,92.0,99,1
5,96.0,28,1


In [13]:
df_cleanedup["y"].value_counts()

0    397326
1    102549
Name: y, dtype: int64

Down sample majority class

In [14]:
df_cleanedup_majority = df_cleanedup[df_cleanedup["y"] == 0]
df_cleanedup_minority = df_cleanedup[df_cleanedup["y"] == 1]
 

df_cleanedup_majority_downsampled = df_cleanedup_majority.sample(n=len(df_cleanedup_minority["y"].index))
 
df_downsampled = pd.concat([df_cleanedup_majority_downsampled, df_cleanedup_minority])
 
df_downsampled["y"].value_counts()

1    102549
0    102549
Name: y, dtype: int64

## Build a machine learning model using `sklearn` and `keras`


prepare training and test data

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_unnormalized = df_downsampled[["x1", "x2"]]
y = df_downsampled["y"]

scaler = MinMaxScaler()
scaler.fit(X_unnormalized)
X = scaler.transform(X_unnormalized)

X


array([[0.03030303, 0.07070707],
       [0.45454545, 0.22222222],
       [0.32323232, 0.57575758],
       ...,
       [0.98989899, 0.46464646],
       [1.        , 0.8989899 ],
       [0.45454545, 0.94949495]])

In [16]:
X.shape

(205098, 2)

In [17]:
y.shape

(205098,)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [19]:
X_train.shape

(164078, 2)

In [20]:
X_test.shape

(41020, 2)

In [21]:
y_train.shape

(164078,)

In [22]:
y_test.shape

(41020,)

build model

In [23]:
from keras.layers import Input, Dense
from keras.models import Model
import keras.utils

input = Input(shape=(2,))
x = Dense(32, activation="relu")(input)
x = Dense(32, activation="relu")(x)
output = Dense(1, activation="sigmoid")(x)
model = Model(input, output)

model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
dense (Dense)                (None, 32)                96        
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,185
Trainable params: 1,185
Non-trainable params: 0
_________________________________________________________________


train model

In [24]:
model.compile(
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
model.evaluate(X_test, y_test)



[0.7270117402076721, 0.5061677098274231]

In [25]:
model.fit(X_train, y_train, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7fbbb8782e80>

Evaluate model

In [26]:
model.evaluate(X_test, y_test)



[0.5316098928451538, 0.9373964071273804]