In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

import dask.dataframe as ddf

# https://www.kaggle.com/hhs/health-insurance-marketplace
# https://www.kaggle.com/shelars1985/exploring-health-insurance-marketplace

In [4]:
df_chunk = pd.read_csv('../data/csv/BenefitsCostSharing.csv', chunksize=100000)

# Create a panda series of column names
df = pd.read_csv('../data/csv/BenefitsCostSharing.csv').head(1)
df_columns = list(df.columns)
ser = pd.Series(0, index=df_columns)

# Initialise a variable to compute average bytes per chunk
ave_bytes = 0

# then we initialise our loop counter
count = 0

for index, chunk in enumerate(df_chunk):
    # We add total memory per chunk to ave_bytes
    ave_bytes += chunk.memory_usage().sum()
    df_isnull = chunk.isnull().sum()
    ser += df_isnull
    
print("Total number of chunks:",index)
ave_bytes = ave_bytes / index
print("Average bytes per loop:",ave_bytes)
print(ser)

Total number of chunks: 14
Average bytes per loop: 26485316.285714287
BenefitName                  0
BusinessYear                 0
CoinsInnTier1           316175
CoinsInnTier2          1185667
CoinsOutofNet           316175
CopayInnTier1           316175
CopayInnTier2          1185667
CopayOutofNet           316175
EHBVarReason            950054
Exclusions             1313736
Explanation            1147515
ImportDate                   0
IsCovered                59616
IsEHB                   385963
IsExclFromInnMOOP       241147
IsExclFromOonMOOP       241179
IsStateMandate         1201678
IsSubjToDedTier1       1448408
IsSubjToDedTier2       1448408
IssuerId                     0
IssuerId2                    0
LimitQty               1251896
LimitUnit              1251885
MinimumStay            1444031
PlanId                       0
QuantLimitOnSvc         970910
RowNumber                    0
SourceName                   0
StandardComponentId          0
StateCode                    0


TypeError: 'TextFileReader' object is not subscriptable

In [8]:
df_chunk = pd.read_csv('../data/csv/BenefitsCostSharing.csv', chunksize=100000)
# Initialise a variable to compute average bytes per chunk
ave_bytes = 0

# then we initialise our loop counter
count = 0
unique = 0

for index, chunk in enumerate(df_chunk):
    # We add total memory per chunk to ave_bytes
    ave_bytes += chunk.memory_usage().sum()
    count += chunk.BenefitName.count()
    unique += chunk.BenefitName.nunique() # WRONG NUMBER INVESTIGATE
    
print("Total number of chunks:",index)
ave_bytes = ave_bytes / index
print("Average bytes per loop:",ave_bytes)
print("Number of rows in BenefitsCostSharing", count)
print("Number of uniuqe Benefits", unique)

Total number of chunks: 50
Average bytes per loop: 25847983.52
Number of rows in BenefitsCostSharing 5048408
Number of uniuqe Benefits 6720


In [None]:
df.BenefitName.nunique().compute()

In [None]:
df[['LimitQty', 'MinimumStay']].describe().compute()

In [None]:
df['BenefitName'].describe().compute()

In [None]:
df[["BusinessYear","BenefitName"]].groupby('BusinessYear').BusinessYear.describe().compute()

In [None]:
def fetch_sql(query):
    conn = sqlite3.connect('../data/sql/database.sqlite')
    cur = conn.cursor()
    coords = cur.execute(query).fetchall()
    cur.close()
    conn.close()
    return coords

In [None]:
def fetch_sql_df(query):
    conn = sqlite3.connect('../data/sql/database.sqlite')
    df = pd.read_sql_query(query, conn, )
    conn.close()
    return df

Let's start our journey by analyzing how plan rates and benefits vary across states?

In [None]:
query = 'SELECT * FROM BenefitsCostSharing LIMIT 10;'
df = fetch_sql_df(query)
df

Lets find columns with null values.

In [None]:
col = table_columns['BenefitsCostSharing']
for c in col:
    query = """SELECT COUNT({}) 
                FROM BenefitsCostSharing 
                WHERE {} = '';""".format(c, c)
    q = fetch_sql(query)
    print(c, q[0][0]) # , round((q[0][0]/5048408*100),2))

How many records do we have in our dataset?

In [None]:
query = '''SELECT COUNT(*)
           FROM BenefitsCostSharing'''

total_records = fetch_sql(query)

print('Total records in file:', total_records[0][0])

How many unique benefits do we have in our dataset?

In [None]:
query = """SELECT COUNT (DISTINCT BenefitName)
           FROM BenefitsCostSharing
           """
unique_benifits = fetch_sql(query)

print('Unique benefits present in file:', unique_benifits[0][0])

We have 861 unique benefits sold in the US from 2014-16

In [None]:
query =  """SELECT COUNT(LimitQty) AS N,
            SUM(LimitQty) AS sum,
            AVG(LimitQty) AS mean,
            MIN(LimitQty) AS minimum,
            MAX(LimitQty) AS maximum
            FROM BenefitsCostSharing;"""
q = fetch_sql(query)
q

In [None]:
col = table_columns['BenefitsCostSharing']
for c in col:
    print(c)
    query = """SELECT STDEV({}) FROM BenefitsCostSharing;""".format(c)
    print(query)
    q = fetch_sql_df(query)
    print(q)
    break

In [None]:
query = """SELECT COUNT (BenefitName), COUNT (DISTINCT BenefitName), BusinessYear
           FROM BenefitsCostSharing
           GROUP BY BusinessYear;
           """

fetch_sql_df(query)

In [None]:
query = """SELECT year, name
           FROM (SELECT BusinessYear as year, Benefitname as name, COUNT(*),
                        ROW_NUMBER() OVER (PARTITION BY BusinessYear 
                                           ORDER BY COUNT(*) DESC) as rn
                 FROM BenefitsCostSharing
                 GROUP BY BusinessYear, Benefitname
                 ) t
           WHERE rn = 1;"""

fetch_sql_df(query)

In [None]:
query = """SELECT StateCode AS State, COUNT(BenefitName) AS NumBenefits
           FROM BenefitsCostSharing
           GROUP BY StateCode
           ORDER BY StateCode;"""

state_df = fetch_sql_df(query)

In [None]:
state_df.plot(kind='bar', x='State')

In [None]:
query = """SELECT State, name
           FROM (SELECT StateCode AS State, Benefitname as name, COUNT(*),
                        ROW_NUMBER() OVER (PARTITION BY StateCode 
                                           ORDER BY COUNT(*) DESC) as rn
                 FROM BenefitsCostSharing
                 GROUP BY StateCode, Benefitname
                 ) t
           WHERE rn = 1;"""

fetch_sql_df(query)