# Background research_1

## Diabetes type 1 prevalence per country in 2022

In [1]:
import getpass
import os
import re

import pandas as pd
import psycopg2 as pg2
import PyPDF2

### Diabetes cases per country 2022 [[data source]](https://diabetesatlas.org/idfawp/resource-files/2022/12/IDF-T1D-Index-Report.pdf)

#### Extracting pdf table

In [3]:
# Open the PDF report in binary mode; the handle stays open until the
# explicit f.close() cell further down
f = open('IDF-T1D-Index-Report.pdf', mode='rb')

In [4]:
# Define pdf reader on top of the open file handle `f`
# NOTE(review): PdfFileReader is the legacy PyPDF2 class name; newer PyPDF2
# releases expose it as PdfReader -- confirm against the installed version
pdf_reader = PyPDF2.PdfFileReader(f)

Xref table not zero-indexed. ID numbers for objects will be corrected.


In [5]:
# Discover the number of pages in the document
# (bare last expression, so the notebook displays the count)
pdf_reader.numPages

15

In [6]:
# The table of interest is printed as page 11 of the report (see the '| 11'
# footer in the extracted text); it is fetched with index 10.
# NOTE(review): presumably getPage() is zero-based -- confirm in the PyPDF2 docs
page_10 = pdf_reader.getPage(10) 

In [7]:
# Extract the raw text of the selected page and display it to inspect the layout
page_10_text = page_10.extractText()
page_10_text

"Type 1 diabetes estimates in children and adults – 2022 | www.diabetesatlas.org | 11\n0000001Country Age <20 years Age 20 - 59 years Age >=60 years Total\nAfghanistan5,487                                       4,370                                                120                                             9,964                                        \nAlbania 723                                           2,790                                                 898                                             4,406                                       \nAlgeria 46,642                                     103,731                                            13,423                                       163,767                                   \nAngola 3,946                                       8,074                                                207                                             12,237                                       \nAntigua and Barbuda 26                           

In [8]:
# Close the file: the page text is already held in page_10_text
f.close()

#### Preprocessing the extracted text

In [9]:
# Rows of the extracted text are separated by '\n'.
# The first two pieces are the page title and the column-name line,
# so the country rows start at position 2.
table = page_10_text.split(sep='\n')[2:]
table

['Afghanistan5,487                                       4,370                                                120                                             9,964                                        ',
 'Albania 723                                           2,790                                                 898                                             4,406                                       ',
 'Algeria 46,642                                     103,731                                            13,423                                       163,767                                   ',
 'Angola 3,946                                       8,074                                                207                                             12,237                                       ',
 'Antigua and Barbuda 26                                              87                                                       13                                                126             

In [10]:
# A row example (United States, whose total is above one million)
table[190]

'United States 170,408                                 880,937                                           397,447                                    1,447,298                                '

In [11]:
# Use a regex to grab the first run of non-digit characters,
# i.e. the country name that precedes the figures
c = re.compile(r'\D+').search(table[190]).group(0)
c

'United States '

In [12]:
# Split the name into its whitespace-separated words
# (drops the trailing space left over from the previous step)
re.findall(r'[^\s]+',c)

['United', 'States']

In [13]:
# Joining the words back with a single space in between
' '.join(re.findall(r'[^\s]+',c))

'United States'

In [14]:
# Searching for digit patterns: either "digits,three digits" or a bare run of digits.
# A figure with two thousands separators (e.g. 1,447,298) therefore comes back
# split into two matches ('1,447' and '298')
n_numbers = re.findall(r'\d+,\d{3}|\d+',table[190])
n_numbers

['170,408', '880,937', '397,447', '1,447', '298']

In [15]:
# For cases where the total count is in millions:
# keep the first three figures and glue every remaining piece into one total.
# The comma is still present at this point, hence '1,447298' in the output
n_numbers = n_numbers[:3]+[''.join(n_numbers[3:])]
n_numbers

['170,408', '880,937', '397,447', '1,447298']

In [17]:
# One empty list per future DataFrame column
# (five independent list objects, filled row by row in the next cell)
country, age_less_than_20, age_20_59, age_more_than_60, total = (
    [] for _ in range(5)
)

In [18]:
# Parse every extracted row into a country name plus four figures
# (age <20, age 20-59, age >=60, total) and append them to the column lists.

# A figure is a run of digits with any number of ",ddd" thousands groups, so
# values in the millions (e.g. 1,447,298) are captured as ONE match in whichever
# column they occur, instead of being split and re-joined afterwards.
number_pattern = re.compile(r'\d+(?:,\d{3})*')

# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
for column_values in (country, age_less_than_20, age_20_59, age_more_than_60, total):
    column_values.clear()

for row in table:
    if not row.strip():  # skip blank lines (e.g. a trailing newline in the page text)
        continue

    country_name = re.search(r'\D+', row).group()  # leading non-digit run = country name
    country.append(' '.join(country_name.split()))  # trim and collapse the whitespace

    # figures as digit-only strings (thousands separators removed)
    new_numbers = [n.replace(',', '') for n in number_pattern.findall(row)]
    # A row with fewer than four figures has an empty cell in the PDF; pad with ''
    # so the gap stays visible and is handled explicitly when converting to int.
    new_numbers += [''] * (4 - len(new_numbers))

    # one value per numeric column
    age_less_than_20.append(new_numbers[0])
    age_20_59.append(new_numbers[1])
    age_more_than_60.append(new_numbers[2])
    total.append(new_numbers[3])

In [20]:
# Checking that all five column lists have the same length
# (the total column is included too, and a mismatch fails loudly)
column_lengths = [
    len(values)
    for values in (country, age_less_than_20, age_20_59, age_more_than_60, total)
]
print(*column_lengths)
assert len(set(column_lengths)) == 1, 'column lists have different lengths'

201 201 201 201


#### Creating a DF

In [21]:
# Assemble the column lists into a DataFrame (keyword names become the column names)
diabetes_cases = pd.DataFrame(
    dict(
        country=country,
        age_less_than_20=age_less_than_20,
        age_20_59=age_20_59,
        age_more_than_60=age_more_than_60,
        total=total,
    )
)
diabetes_cases

Unnamed: 0,country,age_less_than_20,age_20_59,age_more_than_60,total
0,Afghanistan,5487,4370,120,9964
1,Albania,723,2790,898,4406
2,Algeria,46642,103731,13423,163767
3,Angola,3946,8074,207,12237
4,Antigua and Barbuda,26,87,13,126
...,...,...,...,...,...
196,Viet Nam,4061,9897,839,14780
197,Western Sahara,7,45,1,54
198,Yemen,7109,8991,226,16313
199,Zambia,2279,4908,113,7299


In [22]:
# Inspect column dtypes and non-null counts of the freshly built frame
diabetes_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   country           201 non-null    object
 1   age_less_than_20  201 non-null    object
 2   age_20_59         201 non-null    object
 3   age_more_than_60  201 non-null    object
 4   total             201 non-null    object
dtypes: object(5)
memory usage: 8.0+ KB


In [23]:
# Converting values in the numeric columns into integers.
# The attempt fails while a cell still holds an empty string, so the error is
# caught and displayed instead of halting a "Restart & Run All".
# Columns that precede the offending one are already converted when it fails.
try:
    for c in diabetes_cases.columns[1:]:
        diabetes_cases[c] = diabetes_cases[c].astype(int)
except ValueError as conversion_error:
    print(f'ValueError: {conversion_error}')

ValueError: invalid literal for int() with base 10: ''

In [25]:
# Empty strings mark cells that were blank in the extracted table > fill them with 0.
# Done as one vectorized replace over the four numeric columns instead of
# visiting every cell by position.
# NOTE(review): 0 is a placeholder, not a reported value -- confirm this is acceptable
count_columns = diabetes_cases.columns[1:5]
diabetes_cases[count_columns] = diabetes_cases[count_columns].replace('', 0)

In [26]:
# Second conversion attempt: cast every column after the first one
# ('country') to int in a single astype call
diabetes_cases = diabetes_cases.astype(
    {column_name: int for column_name in diabetes_cases.columns[1:]}
)

In [27]:
# Confirm that the four numeric columns now have an integer dtype
diabetes_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   country           201 non-null    object
 1   age_less_than_20  201 non-null    int64 
 2   age_20_59         201 non-null    int64 
 3   age_more_than_60  201 non-null    int64 
 4   total             201 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 8.0+ KB


### Population by country 2022 [[data source]](https://www.kaggle.com/datasets/whenamancodes/world-population-live-dataset?resource=download)

In [29]:
# Read the population dataset from a CSV file given by relative path
# and display it
population = pd.read_csv('World Population Live Dataset.csv')
population

Unnamed: 0,CCA3,Name,2022,2020,2015,2010,2000,1990,1980,1970,Area (km²),Density (per km²),GrowthRate,World Population Percentage,Rank
0,CN,China,1425887,1424930,1393715,1348191,1264099,1153704,982372,822534,9706961,146.8933,1.00,17.88%,1
1,IN,India,1417173,1396387,1322867,1240614,1059634,870452,696828,557501,3287590,431.0675,1.01,17.77%,2
2,US,United States,338290,335942,324608,311183,282399,248084,223140,200328,9372610,36.0935,1.00,4.24%,3
3,ID,Indonesia,275501,271858,259092,244016,214072,182160,148177,115228,1904569,144.6529,1.01,3.45%,4
4,PK,Pakistan,235825,227197,210969,194454,154370,115414,80624,59291,881912,267.4018,1.02,2.96%,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,MS,Montserrat,4,5,5,5,5,11,11,11,102,43.0392,0.99,0.00%,230
230,FK,Falkland Islands,4,4,3,3,3,2,2,2,12173,0.3105,1.00,0.00%,231
231,NU,Niue,2,2,2,2,2,3,4,5,260,7.4385,1.00,0.00%,232
232,TK,Tokelau,2,2,1,1,2,2,2,2,12,155.9167,1.01,0.00%,233


In [30]:
# Build a two-column frame (country, population) from the 2022 figures.
# The source column is multiplied by 1000.
# NOTE(review): presumably the dataset reports population in thousands -- confirm
population_2022 = (
    population[['Name', '2022']]
    .rename(columns={'Name': 'country', '2022': 'population'})
    .assign(population=lambda frame: frame['population'] * 1000)
)

In [31]:
# Display the 2022 population frame
population_2022

Unnamed: 0,country,population
0,China,1425887000
1,India,1417173000
2,United States,338290000
3,Indonesia,275501000
4,Pakistan,235825000
...,...,...
229,Montserrat,4000
230,Falkland Islands,4000
231,Niue,2000
232,Tokelau,2000


In [32]:
# Inspect dtypes and non-null counts of the population frame
population_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   country     234 non-null    object
 1   population  234 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 3.8+ KB


### Merging two DFs

In [33]:
# Left join: keep every row of diabetes_cases and attach the population
# of the population_2022 row carrying the same 'country' string
diabetes_cases_and_population = pd.merge(
    diabetes_cases, population_2022, how='left', on='country'
)
diabetes_cases_and_population

Unnamed: 0,country,age_less_than_20,age_20_59,age_more_than_60,total,population
0,Afghanistan,5487,4370,120,9964,41129000.0
1,Albania,723,2790,898,4406,2842000.0
2,Algeria,46642,103731,13423,163767,44903000.0
3,Angola,3946,8074,207,12237,35589000.0
4,Antigua and Barbuda,26,87,13,126,94000.0
...,...,...,...,...,...,...
196,Viet Nam,4061,9897,839,14780,
197,Western Sahara,7,45,1,54,576000.0
198,Yemen,7109,8991,226,16313,33697000.0
199,Zambia,2279,4908,113,7299,20018000.0


In [36]:
# Rows for which the merge found no population (NaN)
diabetes_cases_and_population.loc[lambda frame: frame['population'].isna()]

Unnamed: 0,country,age_less_than_20,age_20_59,age_more_than_60,total,population
24,Brunei Darussalam,19,70,9,98,
34,Channel Islands,95,667,201,962,
40,Cote d'Ivoire,2500,5147,164,7804,
43,Curaçao,2,5,2,9,
46,Democratic Republic of the Congo,5811,12283,423,18534,
58,Federated States of Micronesia,1,1,2,0,
105,Macao,51,309,85,445,
106,Macedonia,489,2137,569,3194,
146,Republic of Congo,557,1319,43,1917,
147,Réunion,84,300,32,415,


A few rows have a missing population, probably because the country name is spelled differently in the two datasets. Below, each of these countries is searched for in the population_2022 DF and, if it is found, its population is transferred to the merged DF.

In [37]:
# Look up population_2022 rows whose country name contains 'Brunei'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Brunei', regex=False)]

    country  population
174  Brunei      449000


In [38]:
# Row 24 is 'Brunei Darussalam'; the population dataset calls it 'Brunei'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[24, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Brunei', 'population'].iloc[0]
)

In [39]:
# Look up population_2022 rows whose country name contains 'Islands'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Islands', regex=False)]

             country  population
165  Solomon Islands      724000
                          country  population
199  United States Virgin Islands       99000
            country  population
204  Cayman Islands       69000
           country  population
208  Faroe Islands       53000
                      country  population
209  Northern Mariana Islands       50000
                      country  population
211  Turks and Caicos Islands       46000
              country  population
214  Marshall Islands       42000
                    country  population
220  British Virgin Islands       31000
          country  population
222  Cook Islands       17000
              country  population
230  Falkland Islands        4000


In [40]:
# Row 34 is 'Channel Islands'; the 'Islands' search lists no entry of that
# name, so the row is removed.
# NOTE(review): the population dataset may list the islands under other names -- confirm
# errors='ignore' keeps the cell re-runnable once the row is already gone.
diabetes_cases_and_population = diabetes_cases_and_population.drop(index=34, errors='ignore')

In [41]:
# Look up population_2022 rows whose country name contains "d'Ivoire"
# (vectorized substring match; an empty frame means no match)
population_2022[population_2022['country'].str.contains("d'Ivoire", regex=False)]

In [42]:
# Row 40 is "Cote d'Ivoire"; the search for "d'Ivoire" returned nothing, so the
# row is removed.
# NOTE(review): the population dataset may use another name (e.g. 'Ivory Coast')
# -- confirm before dropping this country
# errors='ignore' keeps the cell re-runnable once the row is already gone.
diabetes_cases_and_population = diabetes_cases_and_population.drop(index=40, errors='ignore')

In [43]:
# Look up population_2022 rows whose country name contains 'Curacao'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Curacao', regex=False)]

     country  population
188  Curacao      191000


In [44]:
# Row 43 is 'Curaçao'; the population dataset spells it 'Curacao'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[43, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Curacao', 'population'].iloc[0]
)

In [45]:
# Look up population_2022 rows whose country name contains 'Congo'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Congo', regex=False)]

     country  population
14  DR Congo    99010000
                   country  population
113  Republic of the Congo     5970000


In [46]:
# Row 146 is 'Republic of Congo'; the population dataset calls it
# 'Republic of the Congo'. The value is read from population_2022.
diabetes_cases_and_population.loc[146, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Republic of the Congo', 'population'].iloc[0]
)

In [47]:
# Row 46 is 'Democratic Republic of the Congo'; the population dataset calls it
# 'DR Congo'. The value is read from population_2022.
diabetes_cases_and_population.loc[46, 'population'] = (
    population_2022.loc[population_2022['country'] == 'DR Congo', 'population'].iloc[0]
)

In [48]:
# Look up population_2022 rows whose country name contains 'Micronesia'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Micronesia', regex=False)]

        country  population
193  Micronesia      114000


In [49]:
# Row 58 is 'Federated States of Micronesia'; the population dataset calls it
# 'Micronesia'. The value is read from population_2022.
diabetes_cases_and_population.loc[58, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Micronesia', 'population'].iloc[0]
)

In [50]:
# Look up population_2022 rows whose country name contains 'Macau'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Macau', regex=False)]

    country  population
166   Macau      695000


In [51]:
# Row 105 is 'Macao'; the population dataset spells it 'Macau'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[105, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Macau', 'population'].iloc[0]
)

In [52]:
# Look up population_2022 rows whose country name contains 'Macedonia'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Macedonia', regex=False)]

             country  population
149  North Macedonia     2094000


In [53]:
# Row 106 is 'Macedonia'; the population dataset calls it 'North Macedonia'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[106, 'population'] = (
    population_2022.loc[population_2022['country'] == 'North Macedonia', 'population'].iloc[0]
)

In [54]:
# Look up population_2022 rows whose country name contains 'Reunion'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Reunion', regex=False)]

     country  population
160  Reunion      974000


In [55]:
# Row 147 is 'Réunion'; the population dataset spells it 'Reunion'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[147, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Reunion', 'population'].iloc[0]
)

In [56]:
# Look up population_2022 rows whose country name contains 'Lucia'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Lucia', regex=False)]

         country  population
189  Saint Lucia      180000


In [57]:
# Row 168 receives the population of 'Saint Lucia', read from population_2022.
# NOTE(review): presumably row 168 is Saint Lucia under a different spelling
# -- the row's name is not shown here, confirm
diabetes_cases_and_population.loc[168, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Saint Lucia', 'population'].iloc[0]
)

In [58]:
# Look up population_2022 rows whose country name contains 'Grenadines'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Grenadines', regex=False)]

                              country  population
198  Saint Vincent and the Grenadines      104000


In [59]:
# Row 169 receives the population of 'Saint Vincent and the Grenadines',
# read from population_2022.
# NOTE(review): presumably row 169 is that country under a different spelling
# -- the row's name is not shown here, confirm
diabetes_cases_and_population.loc[169, 'population'] = (
    population_2022.loc[
        population_2022['country'] == 'Saint Vincent and the Grenadines', 'population'
    ].iloc[0]
)

In [60]:
# Look up population_2022 rows whose country name contains 'Timor'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Timor', regex=False)]

         country  population
154  Timor-Leste     1341000


In [61]:
# Row 179 is "Timor L'Este"; the population dataset calls it 'Timor-Leste'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[179, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Timor-Leste', 'population'].iloc[0]
)

In [62]:
# Look up population_2022 rows whose country name contains 'Viet'
# (vectorized substring match; all matching rows are shown as one frame)
population_2022[population_2022['country'].str.contains('Viet', regex=False)]

    country  population
15  Vietnam    98187000


In [63]:
# Row 196 is 'Viet Nam'; the population dataset spells it 'Vietnam'.
# The value is read from population_2022 rather than typed in by hand.
diabetes_cases_and_population.loc[196, 'population'] = (
    population_2022.loc[population_2022['country'] == 'Vietnam', 'population'].iloc[0]
)

In [64]:
# Re-check: rows that still have no population (NaN) after the manual fixes
diabetes_cases_and_population.loc[lambda frame: frame['population'].isna()]

Unnamed: 0,country,age_less_than_20,age_20_59,age_more_than_60,total,population


In [65]:
# Inspect dtypes and non-null counts of the merged frame
diabetes_cases_and_population.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 200
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country           199 non-null    object 
 1   age_less_than_20  199 non-null    int64  
 2   age_20_59         199 non-null    int64  
 3   age_more_than_60  199 non-null    int64  
 4   total             199 non-null    int64  
 5   population        199 non-null    float64
dtypes: float64(1), int64(4), object(1)
memory usage: 19.0+ KB


In [66]:
# Cast the 'population' column to int
# (the preceding info() output reports it as float64)
diabetes_cases_and_population = diabetes_cases_and_population.astype({'population': int})

In [67]:
# Find rows of the merged frame whose country name contains an apostrophe
# (vectorized substring match; all matching rows are shown as one frame)
diabetes_cases_and_population[
    diabetes_cases_and_population['country'].str.contains("'", regex=False)
]

          country  age_less_than_20  age_20_59  age_more_than_60  total  \
179  Timor L'Este                86        108                 4    199   

     population  
179     1341000  


In [68]:
# Replace the apostrophe in row 179's name ("Timor L'Este") with a hyphen.
# NOTE(review): presumably done so the name can sit inside the single-quoted
# SQL literal built further down -- confirm
diabetes_cases_and_population.at[179, 'country'] = 'Timor L-Este'

### Calculating prevalence

In [69]:
# Prevalence = total cases / population, per country, rounded to 6 decimals.
# The population column is bound to `population_counts`, not `population`:
# reusing `population` would overwrite the raw population DataFrame loaded
# earlier and break a re-run of the cells that read it.
cases = diabetes_cases_and_population['total']
population_counts = diabetes_cases_and_population['population']
diabetes_cases_and_population['prevalence'] = (cases / population_counts).round(6)

In [70]:
# Display the merged frame with the new 'prevalence' column
diabetes_cases_and_population

Unnamed: 0,country,age_less_than_20,age_20_59,age_more_than_60,total,population,prevalence
0,Afghanistan,5487,4370,120,9964,41129000,0.000242
1,Albania,723,2790,898,4406,2842000,0.001550
2,Algeria,46642,103731,13423,163767,44903000,0.003647
3,Angola,3946,8074,207,12237,35589000,0.000344
4,Antigua and Barbuda,26,87,13,126,94000,0.001340
...,...,...,...,...,...,...,...
196,Viet Nam,4061,9897,839,14780,98187000,0.000151
197,Western Sahara,7,45,1,54,576000,0.000094
198,Yemen,7109,8991,226,16313,33697000,0.000484
199,Zambia,2279,4908,113,7299,20018000,0.000365


### Saving as CSV

In [71]:
# Write the merged frame (including prevalence) to CSV without the index column
diabetes_cases_and_population.to_csv('DiabetesCasesPerCountry.csv',index=False)

### Uploading to SQL server

In [98]:
# Database password: taken from the PGPASSWORD environment variable when set,
# otherwise asked for interactively, so it is never stored in the notebook
# and `secret` is always defined on a fresh kernel.
secret = os.environ.get('PGPASSWORD') or getpass.getpass('PostgreSQL password: ')

In [99]:
# Open the PostgreSQL connection (the password comes from `secret`)
conn = pg2.connect(
    database='diabetes_app',
    user='postgres',
    password=secret,
    port='5434',
)

In [100]:
# Cursor used to run the SQL statements in the following cells
cursor = conn.cursor()

In [101]:
# Drop any previous copy of the table so the upload starts from scratch
cursor.execute('DROP TABLE IF EXISTS diabetes_cases_and_population;')

In [102]:
# DDL for the target table: only country, total, population and prevalence
# are stored (adjacent string literals are concatenated into one statement)
create_query = (
    'CREATE TABLE diabetes_cases_and_population ('
    'country VARCHAR NOT NULL, '
    'total INT NOT NULL, '
    'population INT NOT NULL, '
    'prevalence DOUBLE PRECISION NOT NULL'
    ');'
)

In [103]:
# Create the table using the DDL statement held in create_query
cursor.execute(create_query)

In [104]:
# Commit the DROP/CREATE statements executed so far
conn.commit()

In [105]:
# Prefix of the multi-row INSERT statement (no value groups yet)
insert_query = 'INSERT INTO {} VALUES'.format('diabetes_cases_and_population')

In [106]:
# The last three columns of the frame (total, population, prevalence per the
# displayed Index) are the numeric fields that get uploaded
num_cols = diabetes_cases_and_population.columns[-3:]
num_cols

Index(['total', 'population', 'prevalence'], dtype='object')

In [107]:
# Build one multi-row INSERT statement from the frame.
# Every row is rendered by cursor.mogrify, so psycopg2 does the quoting and
# escaping of the values (names containing a single quote included) instead of
# the statement being glued together from raw strings.
upload_columns = ['country', *num_cols]
# astype(object) + tolist() hands psycopg2 plain Python str/int/float values
upload_rows = diabetes_cases_and_population[upload_columns].astype(object).values.tolist()
row_template = '(' + ','.join(['%s'] * len(upload_columns)) + ')'
# mogrify returns bytes; decode them with the connection's client encoding
client_codec = pg2.extensions.encodings[conn.encoding]
value_groups = [
    cursor.mogrify(row_template, row).decode(client_codec) for row in upload_rows
]

# The statement is rebuilt from scratch, so re-running this cell does not keep
# appending to an already finished query.
insert_query = (
    'INSERT INTO diabetes_cases_and_population VALUES' + ','.join(value_groups) + ';'
)

In [108]:
# Display the assembled INSERT statement for a visual check
insert_query

"INSERT INTO diabetes_cases_and_population VALUES('Afghanistan',9964,41129000,0.000242),('Albania',4406,2842000,0.00155),('Algeria',163767,44903000,0.003647),('Angola',12237,35589000,0.000344),('Antigua and Barbuda',126,94000,0.00134),('Argentina',85529,45510000,0.001879),('Armenia',3649,2780000,0.001313),('Aruba',5,106000,4.7e-05),('Australia',126008,26177000,0.004814),('Austria',32475,8940000,0.003633),('Azerbaijan',9183,10358000,0.000887),('Bahamas',880,410000,0.002146),('Bahrain',2281,1472000,0.00155),('Bangladesh',24878,171186000,0.000145),('Barbados',624,282000,0.002213),('Belarus',16303,9535000,0.00171),('Belgium',42637,11656000,0.003658),('Belize',157,405000,0.000388),('Benin',3675,13353000,0.000275),('Bhutan',475,782000,0.000607),('Bolivia',2779,12224000,0.000227),('Bosnia and Herzegovina',6224,3234000,0.001925),('Botswana',946,2630000,0.00036),('Brazil',588800,215313000,0.002735),('Brunei Darussalam',98,449000,0.000218),('Bulgaria',23556,6782000,0.003473),('Burkina Faso',1146

In [109]:
# Run the multi-row INSERT held in insert_query
cursor.execute(insert_query)

In [110]:
# Commit the inserted rows
conn.commit()

In [111]:
# Close the database connection
conn.close()