# Covid-19 Daily Reports 

### Import the required packages

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import csv
import calendar

In [2]:
# Display the current working directory; os.getcwdb() returns it as a bytes path.
os.getcwdb()

b'/Users/tejashree/Desktop/Data Science Projects/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports'

In [3]:
# Change directory to the path os.getcwdb() reports, i.e. the directory we are
# already in, so this call leaves the working directory unchanged.
os.chdir(os.getcwdb())

### Read the dataset 

#### Collect all the daily-report csv files whose names match the 'MM-DD-2020.csv' format

In [4]:
# File extension of the daily reports.
extension = 'csv'

# Names of every file in the working directory that matches '*-*-2020.csv'.
# glob.glob() already returns a list (in arbitrary order), so no extra
# comprehension is needed to materialise it.
all_filenames = glob.glob('*-*-{}.{}'.format(2020, extension))

In [5]:
# Number of daily-report files matched by the glob pattern.
len(all_filenames)

301

In [6]:
# Display the matched names; glob returns them in arbitrary, not chronological, order.
all_filenames

['02-26-2020.csv',
 '02-27-2020.csv',
 '06-07-2020.csv',
 '06-06-2020.csv',
 '10-01-2020.csv',
 '09-22-2020.csv',
 '09-23-2020.csv',
 '04-08-2020.csv',
 '04-09-2020.csv',
 '02-18-2020.csv',
 '02-19-2020.csv',
 '03-24-2020.csv',
 '03-25-2020.csv',
 '08-20-2020.csv',
 '08-21-2020.csv',
 '11-03-2020.csv',
 '11-02-2020.csv',
 '07-05-2020.csv',
 '07-04-2020.csv',
 '04-02-2020.csv',
 '04-03-2020.csv',
 '09-28-2020.csv',
 '07-31-2020.csv',
 '07-30-2020.csv',
 '09-29-2020.csv',
 '08-14-2020.csv',
 '08-15-2020.csv',
 '03-10-2020.csv',
 '03-11-2020.csv',
 '09-16-2020.csv',
 '09-17-2020.csv',
 '05-01-2020.csv',
 '11-09-2020.csv',
 '11-08-2020.csv',
 '02-12-2020.csv',
 '02-13-2020.csv',
 '10-16-2020.csv',
 '10-17-2020.csv',
 '05-23-2020.csv',
 '05-22-2020.csv',
 '08-09-2020.csv',
 '06-10-2020.csv',
 '06-11-2020.csv',
 '08-08-2020.csv',
 '04-21-2020.csv',
 '04-20-2020.csv',
 '10-28-2020.csv',
 '10-29-2020.csv',
 '07-12-2020.csv',
 '07-13-2020.csv',
 '11-14-2020.csv',
 '11-15-2020.csv',
 '03-07-2020

In [7]:
# The names follow MM-DD-2020.csv, so a plain lexicographic sort puts the
# files in chronological order.
all_filenames.sort()

# d maps file name -> header line for the first file and for every later file
# whose header differs from the header of the file just before it.
d = {}
previous_header = None
for filename in all_filenames:
    # Only the header row is needed, so read a single line instead of the
    # whole file; the context manager closes the file even if reading fails.
    # 'utf-8-sig' strips a leading byte-order mark so headers compare cleanly.
    with open(filename, 'r', encoding='utf-8-sig') as report:
        header = report.readline()
    # previous_header is None for the first file, so it is always recorded.
    if header != previous_header:
        d[filename] = header
    previous_header = header

print(d)
        

{'01-22-2020.csv': 'Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered\n', '03-01-2020.csv': 'Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude\n', '03-22-2020.csv': 'FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key\n', '05-29-2020.csv': 'FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio\n', '11-09-2020.csv': 'FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio\n'}


#### Dictionary `d` maps each file name at which the header changed to that file's header line, so we can see how the column names evolved from file to file

In [8]:
# Wrap the header-change dictionary in a Series (index = file name,
# value = header line) and display it, one row per recorded header.
col_change=pd.Series(d)
col_change

01-22-2020.csv    Province/State,Country/Region,Last Update,Conf...
03-01-2020.csv    Province/State,Country/Region,Last Update,Conf...
03-22-2020.csv    FIPS,Admin2,Province_State,Country_Region,Last...
05-29-2020.csv    FIPS,Admin2,Province_State,Country_Region,Last...
11-09-2020.csv    FIPS,Admin2,Province_State,Country_Region,Last...
dtype: object

#### Since the column names change over time, we need to standardize them. The 'Combined_Key' column only repeats the 'city, state, country' values already present in other columns, so we drop it

In [9]:
def combined(f):
    """Load one daily-report CSV and return it with standardized column names.

    Older header spellings are mapped onto the newer underscore-style names,
    and the 'Combined_Key' column is removed when the file has one.

    Parameters
    ----------
    f : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file's rows with renamed columns and without 'Combined_Key'.
    """
    # Old spelling -> standardized spelling; names absent from a file are ignored.
    standard_names = {
        'Province/State': 'Province_State',
        'Country/Region': 'Country_Region',
        'Last Update': 'Last_Update',
        'Lat': 'Latitude',
        'Long_': 'Longitude',
        'Case-Fatality_Ratio': 'Case_Fatality_Ratio',
        'Incidence_Rate': 'Incident_Rate',
    }
    report = pd.read_csv(f).rename(columns=standard_names)
    # errors='ignore' makes the drop a no-op for files without 'Combined_Key'.
    return report.drop(columns='Combined_Key', errors='ignore')
    

#### Combine all the files into one table and set the order of the columns

In [10]:
# Left-to-right column layout for the merged table.
column_order = [
    'FIPS', 'Country_Region', 'Province_State', 'Admin2', 'Latitude', 'Longitude',
    'Last_Update', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Incident_Rate',
    'Case_Fatality_Ratio',
]

# Standardize every daily report, stack them row-wise, then apply the layout.
combined_csv = pd.concat([combined(filename) for filename in all_filenames])
combined_csv = combined_csv[column_order]

In [12]:
# Last_Update still holds raw strings at this point, so this shows the smallest
# string in the column, which is not necessarily the earliest timestamp.
combined_csv.Last_Update.min()

'1/22/2020 17:00'

In [13]:
# Parse the Last_Update strings into real timestamps (datetime64[ns]).
combined_csv['Last_Update'] = combined_csv['Last_Update'].astype('datetime64[ns]')

In [14]:
# Summarize the merged table: per-column non-null counts and dtypes.
combined_csv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 895701 entries, 0 to 3973
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   FIPS                 748915 non-null  float64       
 1   Country_Region       895701 non-null  object        
 2   Province_State       851048 non-null  object        
 3   Admin2               750308 non-null  object        
 4   Latitude             875565 non-null  float64       
 5   Longitude            875565 non-null  float64       
 6   Last_Update          895701 non-null  datetime64[ns]
 7   Confirmed            895682 non-null  float64       
 8   Deaths               895259 non-null  float64       
 9   Recovered            895310 non-null  float64       
 10  Active               887687 non-null  float64       
 11  Incident_Rate        659572 non-null  float64       
 12  Case_Fatality_Ratio  664525 non-null  float64       
dtypes: datetime64[ns

In [15]:
# Write the merged table to the working directory; index=False leaves the
# DataFrame index out of the file, and 'utf-8-sig' prefixes it with a UTF-8
# byte-order mark.
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [16]:
# (rows, columns) of the merged table.
combined_csv.shape

(895701, 13)