In [1]:
import pandas as pd
import requests
import json
import csv

In [2]:

state_data_url = "https://data.covid19india.org/v4/min/data.min.json"

# Upper bound (seconds) on how long the download may block. Without a timeout,
# requests waits indefinitely if the server stops responding.
REQUEST_TIMEOUT_SECONDS = 30


def _flatten_state_record(state_code, state_data):
    """Flatten one per-state entry of the JSON payload into a single flat row.

    Parameters
    ----------
    state_code : str
        Key of the entry in the top-level JSON object (e.g. 'AN', 'AP').
    state_data : dict
        Value of the entry; its 'delta', 'delta7', 'total' and 'meta'
        sections are read. A section that is missing or null is treated as
        empty.

    Returns
    -------
    dict
        One row keyed by the output column names. Counters that are absent
        default to 0 and text fields default to '', so the resulting
        DataFrame never contains NaN for a missing key.
    """
    # `or {}` also covers a section that is present but null, which would
    # otherwise raise AttributeError on the `.get` calls below.
    state_delta = state_data.get('delta') or {}
    state_delta7 = state_data.get('delta7') or {}
    state_total = state_data.get('total') or {}
    state_meta = state_data.get('meta') or {}
    state_tested = state_meta.get('tested') or {}

    return {
        "StateCode": state_code,
        # Change counters from the 'delta' section.
        "StateDeltaConfirmed": state_delta.get('confirmed', 0),
        "StateDeltaDeceased": state_delta.get('deceased', 0),
        "StateDeltaRecovered": state_delta.get('recovered', 0),
        "StateDeltaVaccinated1": state_delta.get('vaccinated1', 0),
        "StateDeltaVaccinated2": state_delta.get('vaccinated2', 0),
        # Change counters from the 'delta7' section.
        "StateDelta7Confirmed": state_delta7.get('confirmed', 0),
        "StateDelta7Deceased": state_delta7.get('deceased', 0),
        "StateDelta7Recovered": state_delta7.get('recovered', 0),
        "StateDelta7Vaccinated1": state_delta7.get('vaccinated1', 0),
        "StateDelta7Vaccinated2": state_delta7.get('vaccinated2', 0),
        # Cumulative counters from the 'total' section.
        "StateTotalConfirmed": state_total.get('confirmed', 0),
        "StateTotalDeceased": state_total.get('deceased', 0),
        "StateTotalRecovered": state_total.get('recovered', 0),
        "StateTotalTested": state_total.get('tested', 0),
        "StateTotalVaccinated1": state_total.get('vaccinated1', 0),
        "StateTotalVaccinated2": state_total.get('vaccinated2', 0),
        # Metadata.
        "StateLastUpdated": state_meta.get('last_updated', ''),
        "StatePopulation": state_meta.get('population', 0),
        "StateNotes": state_meta.get('notes', ''),
        "StateTestedSource": state_tested.get('source', ''),
        # NOTE(review): the rendered outputs show this column empty for every
        # row, so 'last_updated' may not be the key the API uses under
        # meta.tested -- confirm against the raw payload.
        "StateTestedLastUpdated": state_tested.get('last_updated', ''),
    }


# Fetch the JSON data. Fail fast on a hung connection or an HTTP error status
# instead of trying to parse an error page as JSON.
response = requests.get(state_data_url, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()
state = response.json()

# Build one flat row per top-level entry of the payload.
data = []
for state_code, state_data in state.items():
    row = _flatten_state_record(state_code, state_data)
    data.append(row)

# Create a DataFrame
state_covid_data = pd.DataFrame(data)

# Save the DataFrame as a CSV file
state_covid_data.to_csv("state_covid_data.csv", index=False)

In [4]:
# Preview the first two rows to sanity-check the flattened columns.
state_covid_data.head(2)

Unnamed: 0,StateCode,StateDeltaConfirmed,StateDeltaDeceased,StateDeltaRecovered,StateDeltaVaccinated1,StateDeltaVaccinated2,StateDelta7Confirmed,StateDelta7Deceased,StateDelta7Recovered,StateDelta7Vaccinated1,...,StateTotalDeceased,StateTotalRecovered,StateTotalTested,StateTotalVaccinated1,StateTotalVaccinated2,StateLastUpdated,StatePopulation,StateNotes,StateTestedSource,StateTestedLastUpdated
0,AN,0,0,0,3,13,3,0,5,884,...,129,7518,598033,294001,200157,2021-11-01T11:03:10+05:30,397000,,https://dhs.andaman.gov.in/NewEvents/851.pdf,
1,AP,385,4,675,20497,24137,2873,30,3590,1223010,...,14373,2047722,29518787,32976969,20375181,2021-11-01T09:54:14+05:30,52221000,,https://twitter.com/ArogyaAndhra/status/145474...,


In [5]:
# Per-column count of missing values. The extraction cell fills absent keys
# with 0 or '', so a zero here only rules out NaN/None; it does not show that
# the API supplied every field.
state_covid_data.isna().sum()

StateCode                 0
StateDeltaConfirmed       0
StateDeltaDeceased        0
StateDeltaRecovered       0
StateDeltaVaccinated1     0
StateDeltaVaccinated2     0
StateDelta7Confirmed      0
StateDelta7Deceased       0
StateDelta7Recovered      0
StateDelta7Vaccinated1    0
StateDelta7Vaccinated2    0
StateTotalConfirmed       0
StateTotalDeceased        0
StateTotalRecovered       0
StateTotalTested          0
StateTotalVaccinated1     0
StateTotalVaccinated2     0
StateLastUpdated          0
StatePopulation           0
StateNotes                0
StateTestedSource         0
StateTestedLastUpdated    0
dtype: int64

In [6]:
# Number of rows that are exact duplicates of an earlier row.
state_covid_data.duplicated().sum()

0

In [7]:
# Column dtypes and non-null counts for the flattened frame.
state_covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37 entries, 0 to 36
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   StateCode               37 non-null     object
 1   StateDeltaConfirmed     37 non-null     int64 
 2   StateDeltaDeceased      37 non-null     int64 
 3   StateDeltaRecovered     37 non-null     int64 
 4   StateDeltaVaccinated1   37 non-null     int64 
 5   StateDeltaVaccinated2   37 non-null     int64 
 6   StateDelta7Confirmed    37 non-null     int64 
 7   StateDelta7Deceased     37 non-null     int64 
 8   StateDelta7Recovered    37 non-null     int64 
 9   StateDelta7Vaccinated1  37 non-null     int64 
 10  StateDelta7Vaccinated2  37 non-null     int64 
 11  StateTotalConfirmed     37 non-null     int64 
 12  StateTotalDeceased      37 non-null     int64 
 13  StateTotalRecovered     37 non-null     int64 
 14  StateTotalTested        37 non-null     int64 
 15  StateTot

In [8]:
# Print each column name followed by that column's distinct values.
for column_name in state_covid_data.columns:
    print(column_name)
    distinct_values = state_covid_data[column_name].unique()
    print(distinct_values)
    print()

StateCode
['AN' 'AP' 'AR' 'AS' 'BR' 'CH' 'CT' 'DL' 'DN' 'GA' 'GJ' 'HP' 'HR' 'JH'
 'JK' 'KA' 'KL' 'LA' 'LD' 'MH' 'ML' 'MN' 'MP' 'MZ' 'NL' 'OR' 'PB' 'PY'
 'RJ' 'SK' 'TG' 'TN' 'TR' 'TT' 'UP' 'UT' 'WB']

StateDeltaConfirmed
[    0   385     1   212     8     5    32    45    23    20    85    11
    10    95   292  7167  1172    22    63    16   579   488    26    38
     2    21   121  1009    12 12907     6   914]

StateDeltaDeceased
[  0   4   1  11 167  20   2  19 251  15]

StateDeltaRecovered
[    0   675     9   236     3    32    46     1    53    23   198    10
     8    79   345  6439  1399    51    62   610    21   450    25    45
     2   183  1183 13152     6   913]

StateDeltaVaccinated1
[     3  20497     42  19124 114694    211  21312  12482   2572  14685
    371   7867  41034   1584  17604   2010      0  53161     38    705
   5713     10    193  21420   3321    171   9399  38781  77325 750410
  53401   1065 209609]

StateDeltaVaccinated2
[    13  24137    195  37463 145827

In [9]:
# How many distinct values does StateTestedLastUpdated hold?
state_covid_data['StateTestedLastUpdated'].nunique()

1

In [10]:
# As shown above, StateTestedLastUpdated has only one unique value, so it carries no information; dropping that column.

In [11]:
# Drop the constant StateTestedLastUpdated column in place.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
state_covid_data.drop(columns=['StateTestedLastUpdated'], inplace=True, errors='ignore')

In [12]:
# List the remaining column labels.
state_covid_data.columns

Index(['StateCode', 'StateDeltaConfirmed', 'StateDeltaDeceased',
       'StateDeltaRecovered', 'StateDeltaVaccinated1', 'StateDeltaVaccinated2',
       'StateDelta7Confirmed', 'StateDelta7Deceased', 'StateDelta7Recovered',
       'StateDelta7Vaccinated1', 'StateDelta7Vaccinated2',
       'StateTotalConfirmed', 'StateTotalDeceased', 'StateTotalRecovered',
       'StateTotalTested', 'StateTotalVaccinated1', 'StateTotalVaccinated2',
       'StateLastUpdated', 'StatePopulation', 'StateNotes',
       'StateTestedSource'],
      dtype='object')

In [13]:
# Show the row(s) whose StateCode is 'TT'.
is_tt_row = state_covid_data['StateCode'].eq('TT')
state_covid_data.loc[is_tt_row]

Unnamed: 0,StateCode,StateDeltaConfirmed,StateDeltaDeceased,StateDeltaRecovered,StateDeltaVaccinated1,StateDeltaVaccinated2,StateDelta7Confirmed,StateDelta7Deceased,StateDelta7Recovered,StateDelta7Vaccinated1,...,StateTotalConfirmed,StateTotalDeceased,StateTotalRecovered,StateTotalTested,StateTotalVaccinated1,StateTotalVaccinated2,StateLastUpdated,StatePopulation,StateNotes,StateTestedSource
33,TT,12907,251,13152,750410,933460,96071,3727,101753,15883780,...,34285612,458470,33661339,609201294,732371508,330752697,2021-11-01T11:20:24+05:30,1332900000,,https://twitter.com/ICMRDELHI/status/145501364...


In [14]:
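# The unique-value listing above shows some extreme values, and they all belong to the StateCode 'TT' row.
# Its population of about 1.33 billion suggests 'TT' is the all-India total rather than a single state (TODO confirm).
# Dropping the StateCode 'TT' row so it does not distort per-state statistics.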


In [15]:
# Drop the 'TT' row by its StateCode rather than by a hard-coded index label.
# A fixed label (33) would silently remove the wrong row if the API's key
# order or count changes, and would raise KeyError if this cell is re-run.
# Selecting by value is correct in both cases; an empty selection is a no-op.
tt_row_labels = state_covid_data.index[state_covid_data['StateCode'] == 'TT']
state_covid_data.drop(tt_row_labels, axis=0, inplace=True)

In [16]:
# Re-run the 'TT' filter; an empty result confirms the row is gone.
is_tt_row = state_covid_data['StateCode'].eq('TT')
state_covid_data.loc[is_tt_row]

Unnamed: 0,StateCode,StateDeltaConfirmed,StateDeltaDeceased,StateDeltaRecovered,StateDeltaVaccinated1,StateDeltaVaccinated2,StateDelta7Confirmed,StateDelta7Deceased,StateDelta7Recovered,StateDelta7Vaccinated1,...,StateTotalConfirmed,StateTotalDeceased,StateTotalRecovered,StateTotalTested,StateTotalVaccinated1,StateTotalVaccinated2,StateLastUpdated,StatePopulation,StateNotes,StateTestedSource


In [17]:
# Count the distinct state codes left in the frame.
state_covid_data['StateCode'].nunique()

36

In [18]:
# Re-list each column's distinct values now that a row has been dropped.
for column_name in state_covid_data.columns:
    print(column_name)
    distinct_values = state_covid_data[column_name].unique()
    print(distinct_values)
    print()

StateCode
['AN' 'AP' 'AR' 'AS' 'BR' 'CH' 'CT' 'DL' 'DN' 'GA' 'GJ' 'HP' 'HR' 'JH'
 'JK' 'KA' 'KL' 'LA' 'LD' 'MH' 'ML' 'MN' 'MP' 'MZ' 'NL' 'OR' 'PB' 'PY'
 'RJ' 'SK' 'TG' 'TN' 'TR' 'UP' 'UT' 'WB']

StateDeltaConfirmed
[   0  385    1  212    8    5   32   45   23   20   85   11   10   95
  292 7167 1172   22   63   16  579  488   26   38    2   21  121 1009
   12    6  914]

StateDeltaDeceased
[  0   4   1  11 167  20   2  19  15]

StateDeltaRecovered
[   0  675    9  236    3   32   46    1   53   23  198   10    8   79
  345 6439 1399   51   62  610   21  450   25   45    2  183 1183    6
  913]

StateDeltaVaccinated1
[     3  20497     42  19124 114694    211  21312  12482   2572  14685
    371   7867  41034   1584  17604   2010      0  53161     38    705
   5713     10    193  21420   3321    171   9399  38781  77325  53401
   1065 209609]

StateDeltaVaccinated2
[    13  24137    195  37463 145827   1282  39393  11839     20  12404
  62096   8192  15801  51488  45202  28163  11914   

In [19]:
# We can't perform aggregation on the columns below, and they contain information that is irrelevant to this analysis, so dropping them:
# 1. StateNotes
# 2. StateTestedSource

In [20]:
# Drop the free-text StateNotes column in place.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
state_covid_data.drop(columns=['StateNotes'], inplace=True, errors='ignore')

In [21]:
# Drop the StateTestedSource column (source URLs) in place.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
state_covid_data.drop(columns=['StateTestedSource'], inplace=True, errors='ignore')

In [22]:
# List the remaining column labels.
state_covid_data.columns

Index(['StateCode', 'StateDeltaConfirmed', 'StateDeltaDeceased',
       'StateDeltaRecovered', 'StateDeltaVaccinated1', 'StateDeltaVaccinated2',
       'StateDelta7Confirmed', 'StateDelta7Deceased', 'StateDelta7Recovered',
       'StateDelta7Vaccinated1', 'StateDelta7Vaccinated2',
       'StateTotalConfirmed', 'StateTotalDeceased', 'StateTotalRecovered',
       'StateTotalTested', 'StateTotalVaccinated1', 'StateTotalVaccinated2',
       'StateLastUpdated', 'StatePopulation'],
      dtype='object')

In [23]:
# (rows, columns) of the cleaned frame.
state_covid_data.shape

(36, 19)

In [24]:
# List each remaining column's distinct values once more.
for column_name in state_covid_data.columns:
    print(column_name)
    distinct_values = state_covid_data[column_name].unique()
    print(distinct_values)
    print()

StateCode
['AN' 'AP' 'AR' 'AS' 'BR' 'CH' 'CT' 'DL' 'DN' 'GA' 'GJ' 'HP' 'HR' 'JH'
 'JK' 'KA' 'KL' 'LA' 'LD' 'MH' 'ML' 'MN' 'MP' 'MZ' 'NL' 'OR' 'PB' 'PY'
 'RJ' 'SK' 'TG' 'TN' 'TR' 'UP' 'UT' 'WB']

StateDeltaConfirmed
[   0  385    1  212    8    5   32   45   23   20   85   11   10   95
  292 7167 1172   22   63   16  579  488   26   38    2   21  121 1009
   12    6  914]

StateDeltaDeceased
[  0   4   1  11 167  20   2  19  15]

StateDeltaRecovered
[   0  675    9  236    3   32   46    1   53   23  198   10    8   79
  345 6439 1399   51   62  610   21  450   25   45    2  183 1183    6
  913]

StateDeltaVaccinated1
[     3  20497     42  19124 114694    211  21312  12482   2572  14685
    371   7867  41034   1584  17604   2010      0  53161     38    705
   5713     10    193  21420   3321    171   9399  38781  77325  53401
   1065 209609]

StateDeltaVaccinated2
[    13  24137    195  37463 145827   1282  39393  11839     20  12404
  62096   8192  15801  51488  45202  28163  11914   