In [1]:
import json
import pickle
from io import StringIO

import numpy as np
import ocifs
import pandas as pd
import sklearn

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder

In [2]:
# Inline JSON copy of the example records: an array of 20 objects, each with
# the keys id, timestamp, example-weight, height, description, transport,
# disposition, extra-a, extra-b and extra-c. Keeping it in the notebook lets
# the frame be built with pd.read_json without reaching object storage.
# NOTE(review): presumably mirrors columnar-example.json — confirm the two
# stay in sync.
data_string = """[
 {
   "id": 1,
   "timestamp": "14/10/2020 16:07",
   "example-weight": 0.5,
   "height": 1.73,
   "description": "aged, grey-white hair, blue eyes, stern disposition",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 4.83881,
   "extra-b": -0.7685,
   "extra-c": 0.87706
 },
 {
   "id": 2,
   "timestamp": "15/10/2020 16:07",
   "example-weight": 0.5,
   "height": 1.73,
   "description": "impish, black hair, blue eyes, unkempt",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 1.10026,
   "extra-b": 0.51655,
   "extra-c": 0.9632
 },
 {
   "id": 3,
   "timestamp": "16/10/2020 16:07",
   "example-weight": 1,
   "height": 1.9,
   "description": "grey curly hair, frilly shirt, cape, blue eyes",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 0.78601,
   "extra-b": 0.49001,
   "extra-c": 3.03926
 },
 {
   "id": 4,
   "timestamp": "17/10/2020 16:07",
   "example-weight": 1,
   "height": 1.91,
   "description": "brown curly hair, long multicoloured scarf, jelly babies",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 3.56195,
   "extra-b": 2.11667,
   "extra-c": -0.55366
 },
 {
   "id": 5,
   "timestamp": "18/10/2020 16:07",
   "example-weight": 1,
   "height": 1.86,
   "description": "sandy hair, cricketing attire, celery",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 4.36924,
   "extra-b": 0.95905,
   "extra-c": -0.16788
 },
 {
   "id": 6,
   "timestamp": "19/10/2020 16:07",
   "example-weight": 1,
   "height": 1.83,
   "description": "multicoloured coat, erratic behaviour",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 1.36555,
   "extra-b": -0.10767,
   "extra-c": 2.51167
 },
 {
   "id": 7,
   "timestamp": "20/10/2020 16:07",
   "example-weight": 1,
   "height": 1.68,
   "description": "brown hair, question mark pullover, umbrella",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 0.20671,
   "extra-b": 2.86232,
   "extra-c": -0.65055
 },
 {
   "id": 8,
   "timestamp": "21/10/2020 16:07",
   "example-weight": 2,
   "height": 1.75,
   "description": "brown hair, green velvet coat",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 2.61811,
   "extra-b": 2.14574,
   "extra-c": -0.34658
 },
 {
   "id": 9,
   "timestamp": "22/10/2020 16:07",
   "example-weight": 3,
   "height": 1.83,
   "description": "short cropped brown hair, leather jacket",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 5.97242,
   "extra-b": -0.05868,
   "extra-c": -3.71717
 },
 {
   "id": 10,
   "timestamp": "23/10/2020 16:07",
   "example-weight": 3,
   "height": 1.85,
   "description": "brown eyes, brown hair, striped suit with overcoat",
   "transport": "police-box",
   "disposition": "Good",
   "extra-a": 4.60461,
   "extra-b": 2.0726,
   "extra-c": 2.84761
 },
 {
   "id": 11,
   "timestamp": "24/10/2020 16:07",
   "example-weight": 1,
   "height": 1.71,
   "description": "greying hair, brown eyes, black and grey goatee",
   "transport": "white-cube",
   "disposition": "Bad",
   "extra-a": -0.46936,
   "extra-b": -0.93174,
   "extra-c": -2.72745
 },
 {
   "id": 12,
   "timestamp": "25/10/2020 16:07",
   "example-weight": 1,
   "height": 1.77,
   "description": "dark brown hair, black goatee",
   "transport": "white-cube",
   "disposition": "Bad",
   "extra-a": -1.73552,
   "extra-b": 1.30103,
   "extra-c": -3.92933
 },
 {
   "id": 13,
   "timestamp": "26/10/2020 16:07",
   "example-weight": 4,
   "height": 1.61,
   "description": "black hair, blue eyes, victorian dress",
   "transport": "white-cube",
   "disposition": "Bad",
   "extra-a": -0.17391,
   "extra-b": 1.24752,
   "extra-c": -1.83719
 },
 {
   "id": 14,
   "timestamp": "27/10/2020 16:07",
   "example-weight": 1,
   "height": 1.75,
   "description": "bald, inexplicable british accent",
   "transport": "spaceship",
   "disposition": "Good",
   "extra-a": 2.75697,
   "extra-b": 0.40616,
   "extra-c": -5.09777
 },
 {
   "id": 15,
   "timestamp": "28/10/2020 16:07",
   "example-weight": 1,
   "height": 1.89,
   "description": "brown hair, brown goatee",
   "transport": "spaceship",
   "disposition": "Good",
   "extra-a": 5.82101,
   "extra-b": -2.52022,
   "extra-c": -3.26192
 },
 {
   "id": 16,
   "timestamp": "29/10/2020 16:07",
   "example-weight": 1,
   "height": 1.78,
   "description": "golden eyes, brown hair, pale complexion",
   "transport": "spaceship",
   "disposition": "Good",
   "extra-a": 1.71953,
   "extra-b": -0.72907,
   "extra-c": -0.24401
 },
 {
   "id": 17,
   "timestamp": "30/10/2020 16:07",
   "example-weight": 1,
   "height": 1.6,
   "description": "black eyes, long dark hair",
   "transport": "spaceship",
   "disposition": "Good",
   "extra-a": 0.58071,
   "extra-b": -0.43875,
   "extra-c": 2.57832
 },
 {
   "id": 18,
   "timestamp": "31/10/2020 16:07",
   "example-weight": 2,
   "height": 1.65,
   "description": "pale skin, cybernetic augmentation",
   "transport": "spaceship",
   "disposition": "Bad",
   "extra-a": -2.80532,
   "extra-b": -1.52983,
   "extra-c": -0.53828
 },
 {
   "id": 19,
   "timestamp": "01/11/2020 16:07",
   "example-weight": 1,
   "height": 1.78,
   "description": "golden eyes, brown hair, pale complexion",
   "transport": "spaceship",
   "disposition": "Bad",
   "extra-a": -0.51847,
   "extra-b": -0.46098,
   "extra-c": 0.51498
 },
 {
   "id": 20,
   "timestamp": "02/11/2020 16:07",
   "example-weight": 3,
   "height": 1.75,
   "description": "bald, pointy ears",
   "transport": "spaceship",
   "disposition": "Bad",
   "extra-a": 1.32672,
   "extra-b": 1.21234,
   "extra-c": -0.63983
 }
]"""



In [3]:
# Where to read the example data from.
#   * None (default): parse the inline `data_string` defined above, so the
#     notebook runs top-to-bottom without any cloud credentials.
#   * An Object Storage URI, e.g.
#     "oci://<bucket-name>@<tenancy-name>/<path-to-data>/columnar-example.json":
#     read through the `oci://` filesystem protocol using the local OCI config.
DATA_URI = None

if DATA_URI is None:
    # Wrap the literal in StringIO: handing raw JSON text straight to
    # pd.read_json is deprecated since pandas 2.1.
    df = pd.read_json(StringIO(data_string))
else:
    df = pd.read_json(DATA_URI, storage_options={"config": "~/.oci/config"})

In [4]:
# Preview the first rows only instead of dumping the whole frame.
df.head(10)

Unnamed: 0,id,timestamp,example-weight,height,description,transport,disposition,extra-a,extra-b,extra-c
0,1,2020-10-14 16:07:00,0.5,1.73,"aged, grey-white hair, blue eyes, stern dispos...",police-box,Good,4.83881,-0.7685,0.87706
1,2,2020-10-15 16:07:00,0.5,1.73,"impish, black hair, blue eyes, unkempt",police-box,Good,1.10026,0.51655,0.9632
2,3,2020-10-16 16:07:00,1.0,1.9,"grey curly hair, frilly shirt, cape, blue eyes",police-box,Good,0.78601,0.49001,3.03926
3,4,2020-10-17 16:07:00,1.0,1.91,"brown curly hair, long multicoloured scarf, je...",police-box,Good,3.56195,2.11667,-0.55366
4,5,2020-10-18 16:07:00,1.0,1.86,"sandy hair, cricketing attire, celery",police-box,Good,4.36924,0.95905,-0.16788
5,6,2020-10-19 16:07:00,1.0,1.83,"multicoloured coat, erratic behaviour",police-box,Good,1.36555,-0.10767,2.51167
6,7,2020-10-20 16:07:00,1.0,1.68,"brown hair, question mark pullover, umbrella",police-box,Good,0.20671,2.86232,-0.65055
7,8,2020-10-21 16:07:00,2.0,1.75,"brown hair, green velvet coat",police-box,Good,2.61811,2.14574,-0.34658
8,9,2020-10-22 16:07:00,3.0,1.83,"short cropped brown hair, leather jacket",police-box,Good,5.97242,-0.05868,-3.71717
9,10,2020-10-23 16:07:00,3.0,1.85,"brown eyes, brown hair, striped suit with over...",police-box,Good,4.60461,2.0726,2.84761


In [5]:
# Per-column feature extraction:
#   * "description" -> bag-of-words token counts. The column is named with a
#     bare string (not a list) so the vectorizer receives a 1-D column of text.
#   * "transport"   -> integer one-hot indicator columns.
#   * numeric cols  -> forwarded unchanged; FunctionTransformer with no `func`
#     is the identity, and "one-to-one" keeps the input names as output names.
column_trans = ColumnTransformer(
    transformers=[
        ("description_bow", CountVectorizer(), "description"),
        ("transport", OneHotEncoder(dtype="int"), ["transport"]),
        (
            "others",
            FunctionTransformer(feature_names_out="one-to-one"),
            ["height", "extra-a", "extra-b", "extra-c"],
        ),
    ],
    # Do not prefix output feature names with the transformer name.
    verbose_feature_names_out=False,
)

In [6]:
# Fit every sub-transformer on df and build the combined feature matrix.
data = column_trans.fit_transform(df)

In [7]:
# Encode the string class names in "disposition" as integer codes; label_enc
# is kept so the fitted mapping stays available after this cell.
label_enc = LabelEncoder()
label = label_enc.fit_transform(df["disposition"])

In [8]:
# Logistic-regression classifier with scikit-learn's default hyperparameters.
lr = LogisticRegression()

In [9]:
# Train on the transformed features and the encoded labels.
# NOTE(review): the "example-weight" column is neither a feature nor passed as
# sample_weight here — confirm that ignoring it is intended.
lr.fit(data, label)

In [10]:
# Predict on the same matrix the model was fitted on, so anything computed
# from these predictions describes training-set fit, not held-out performance.
predictions = lr.predict(data)

In [11]:
# Build the per-class precision/recall/F1 table first, then print it under a
# header naming the estimator. Classes appear as their encoded integer codes.
report_text = classification_report(label, predictions)
print(f"Classification report for classifier {lr}:\n{report_text}\n")

Classification report for classifier LogisticRegression():
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        14

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20




In [12]:
# Serialise the fitted classifier to model.skl in the working directory.
# NOTE(review): only `lr` is written; the fitted `column_trans` and
# `label_enc` are not persisted here — confirm consumers rebuild them.
with open("model.skl", "wb") as model_file:
    pickle.dump(lr, model_file)