#### Working with json files

Goals -

- Learn how to decompress a file using the command line
- Open a JSON file and load its content
- Explore our dataset and load into a DataFrame

Prepare data - 
- Open (a compressed JSON file) - command line, context manager
- Explore (JSON syntax and Stack Overflow) - key-value pairs
- Import

In [123]:
import gzip
import json
import pandas as pd
import wqet_grader

##### Terminal commands for json

In [124]:
# %%bash
# pwd
# cd data
# gzip -dkf poland-bankruptcy-data-2009.json.gz
#
# NOTE: kept disabled; un-comment the four lines above to regenerate the plain
# .json file that the cells below read. gzip flags: -d decompress, -k keep the
# original .gz, -f overwrite an existing output file.

In [125]:
%%bash
# Preview the first 10 lines of the decompressed JSON file.
# The path is given relative to the notebook's working directory instead of
# running `cd data` first: an unchecked `cd` that fails would let `head` run
# in the wrong directory.
head data/poland-bankruptcy-data-2009.json

{
  "schema": {
    "fields": [
      {
        "name": "company_id",
        "type": "integer"
      },
      {
        "name": "feat_1",
        "type": "number"


#### Exploring json

In [126]:
# Read the decompressed JSON file with a context manager so the handle is closed
# as soon as parsing finishes. The encoding is pinned to UTF-8 because open()
# otherwise falls back to a platform-dependent default.
with open("./data/poland-bankruptcy-data-2009.json", encoding="utf-8") as read_file:
    print(type(read_file))  # type of the file object
    poland_Data = json.load(read_file)
print(type(poland_Data))  # type of the parsed document

<class '_io.TextIOWrapper'>
<class 'dict'>


In [127]:
# List the top-level keys of the parsed JSON document.
print(poland_Data.keys())

dict_keys(['schema', 'data', 'metadata'])


In [128]:
# Display the "metadata" entry of the document (shown as the cell's output).
poland_Data["metadata"]

{'title': 'Ensemble Boosted Trees with Synthetic Features Generation in Application to Bankruptcy Prediction',
 'authors': 'Zieba, M., Tomczak, S. K., & Tomczak, J. M.',
 'journal': 'Expert Systems with Applications',
 'publicationYear': 2016,
 'dataYear': 2009,
 'articleLink': 'doi:10.1016/j.eswa.2016.04.001',
 'datasetLink': 'https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data'}

In [129]:
# Keys available under "schema", then the container type of the "data" entry.
print(poland_Data["schema"].keys())
print(type(poland_Data["data"]))


dict_keys(['fields', 'primaryKey', 'pandas_version'])
<class 'list'>


In [130]:
# Display the first record of the "data" list to see what one entry looks like.
poland_Data["data"][0]

{'company_id': 1,
 'feat_1': 0.17419,
 'feat_2': 0.41299,
 'feat_3': 0.14371,
 'feat_4': 1.348,
 'feat_5': -28.982,
 'feat_6': 0.60383,
 'feat_7': 0.21946,
 'feat_8': 1.1225,
 'feat_9': 1.1961,
 'feat_10': 0.46359,
 'feat_11': 0.21946,
 'feat_12': 0.53139,
 'feat_13': 0.14233,
 'feat_14': 0.21946,
 'feat_15': 592.24,
 'feat_16': 0.6163,
 'feat_17': 2.4213,
 'feat_18': 0.21946,
 'feat_19': 0.12272,
 'feat_20': 37.573,
 'feat_21': 0.9969,
 'feat_22': 0.2951,
 'feat_23': 0.097402,
 'feat_24': 0.75641,
 'feat_25': 0.46359,
 'feat_26': 0.50669,
 'feat_27': 1.9737,
 'feat_28': 0.32417,
 'feat_29': 5.9473,
 'feat_30': 0.22493,
 'feat_31': 0.12272,
 'feat_32': 100.82,
 'feat_33': 3.6203,
 'feat_34': 0.71453,
 'feat_35': 0.2951,
 'feat_36': 1.8079,
 'feat_37': 123140.0,
 'feat_38': 0.46359,
 'feat_39': 0.16501,
 'feat_40': 0.21282,
 'feat_41': 0.041124,
 'feat_42': 0.16501,
 'feat_43': 95.682,
 'feat_44': 58.109,
 'feat_45': 0.94621,
 'feat_46': 0.90221,
 'feat_47': 44.941,
 'feat_48': 0.26003,

In [131]:
# Count the records in the "data" list (presumably one record per company).
len(poland_Data["data"])

9977

In [132]:
# Count the fields (keys) in the first company's record.
len(poland_Data["data"][0])

66

In [133]:
# Sanity check: every record should have the same number of fields (66) as the
# first one. Prints one "ALERT!" per record that deviates; no output means all
# records are consistent.
for item in poland_Data["data"]:
    if len(item) == 66:
        continue
    print("ALERT!")
        

#### Loading the gzipped file directly with Python's gzip module

In [134]:
# Parse the gzip-compressed file directly, without relying on a decompressed
# copy on disk. The archive is read in binary mode; json.loads accepts the
# resulting bytes.
with gzip.open("data/poland-bankruptcy-data-2009.json.gz", mode="rb") as read_file:
    poland_Data_gz = json.loads(read_file.read())
print(type(poland_Data_gz))

<class 'dict'>


#### Explore the data

In [135]:
# Display the "metadata" entry of the document loaded from the .gz archive.
poland_Data_gz["metadata"]

{'title': 'Ensemble Boosted Trees with Synthetic Features Generation in Application to Bankruptcy Prediction',
 'authors': 'Zieba, M., Tomczak, S. K., & Tomczak, J. M.',
 'journal': 'Expert Systems with Applications',
 'publicationYear': 2016,
 'dataYear': 2009,
 'articleLink': 'doi:10.1016/j.eswa.2016.04.001',
 'datasetLink': 'https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data'}

In [136]:
# Repeat the structural checks against the dict parsed from the .gz archive:
# top-level keys, number of records, and number of fields in the first record.
print(poland_Data_gz.keys())
print(len(poland_Data_gz["data"]))
# Previously this line read from `poland_Data` (the dict loaded from the
# decompressed file), so it verified nothing about the gzip-loaded data.
print(len(poland_Data_gz["data"][0]))

dict_keys(['schema', 'data', 'metadata'])
9977
66


In [137]:
# Build a DataFrame from the list of records (one row per record, one column
# per key) and preview the first five rows.
df = pd.DataFrame(data=poland_Data["data"])
df.head(n=5)

Unnamed: 0,company_id,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,feat_56,feat_57,feat_58,feat_59,feat_60,feat_61,feat_62,feat_63,feat_64,bankrupt
0,1,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,...,0.16396,0.37574,0.83604,7e-06,9.7145,6.2813,84.291,4.3303,4.0341,False
1,2,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,...,0.027516,0.271,0.90108,0.0,5.9882,4.1103,102.19,3.5716,5.95,False
2,3,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,...,0.007639,0.000881,0.99236,0.0,6.7742,3.7922,64.846,5.6287,4.4581,False
3,5,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,...,0.17648,0.32188,0.82635,0.073039,2.5912,7.0756,100.54,3.6303,4.6375,False
4,6,0.18206,0.55615,0.32191,1.6045,16.314,0.0,0.18206,0.79808,1.8126,...,0.55577,0.41019,0.46957,0.029421,8.4553,3.3488,107.24,3.4036,12.454,False


#### Create a wrangle function

In [138]:
def wrangle(filename):
    """Load a gzip-compressed JSON dataset into a DataFrame indexed by company.

    Parameters
    ----------
    filename : str or path-like
        Path to a gzip-compressed JSON file whose top-level object has a
        ``"data"`` key holding a list of records, each with a
        ``"company_id"`` field.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by ``company_id``; the remaining record
        keys become the columns.

    Raises
    ------
    KeyError
        If the document has no ``"data"`` key, or the records have no
        ``"company_id"`` field to use as the index.
    """
    # Binary mode is deliberate: json.load works out the text encoding of the
    # raw bytes on its own.
    with gzip.open(filename, "rb") as read_file:
        records = json.load(read_file)["data"]

    # Build the frame once the file is closed. The constructor accepts the list
    # of dicts directly, so no throwaway empty DataFrame is needed.
    return pd.DataFrame(records).set_index("company_id")

In [139]:
# Load the dataset straight from the .gz archive with wrangle() and preview
# the first five rows.
df = wrangle(filename="data/poland-bankruptcy-data-2009.json.gz")
df.head(n=5)

Unnamed: 0_level_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_56,feat_57,feat_58,feat_59,feat_60,feat_61,feat_62,feat_63,feat_64,bankrupt
company_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.17419,0.41299,0.14371,1.348,-28.982,0.60383,0.21946,1.1225,1.1961,0.46359,...,0.16396,0.37574,0.83604,7e-06,9.7145,6.2813,84.291,4.3303,4.0341,False
2,0.14624,0.46038,0.2823,1.6294,2.5952,0.0,0.17185,1.1721,1.6018,0.53962,...,0.027516,0.271,0.90108,0.0,5.9882,4.1103,102.19,3.5716,5.95,False
3,0.000595,0.22612,0.48839,3.1599,84.874,0.19114,0.004572,2.9881,1.0077,0.67566,...,0.007639,0.000881,0.99236,0.0,6.7742,3.7922,64.846,5.6287,4.4581,False
5,0.18829,0.41504,0.34231,1.9279,-58.274,0.0,0.23358,1.4094,1.3393,0.58496,...,0.17648,0.32188,0.82635,0.073039,2.5912,7.0756,100.54,3.6303,4.6375,False
6,0.18206,0.55615,0.32191,1.6045,16.314,0.0,0.18206,0.79808,1.8126,0.44385,...,0.55577,0.41019,0.46957,0.029421,8.4553,3.3488,107.24,3.4036,12.454,False
