In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import termplotlib as tpl
import os

### 1. Data preprocessing 

In [5]:
# Raw CSV hosted in the project's GitHub repository. The URL tracks the `main`
# branch, so the file contents are not pinned to a specific revision.
url = "https://raw.githubusercontent.com/aravind1338/506F20GroupProject/main/master.csv"
# index_col=0 makes the first CSV column the row index; the recorded outputs
# further down show this index is named `country`.
df = pd.read_csv(url, index_col=0)

In [7]:
# (rows, columns) of the loaded frame. `country` is the index at this point,
# so it is not counted among the columns.
df.shape

(27820, 11)

In [8]:
# Column labels exactly as read from the CSV. Note that ' gdp_for_year ($) '
# carries leading and trailing spaces, which matters when selecting it by name.
df.columns

Index(['year', 'sex', 'age', 'suicides_no', 'population', 'suicides/100k pop',
       'country-year', 'HDI for year', ' gdp_for_year ($) ',
       'gdp_per_capita ($)', 'generation'],
      dtype='object')

In [17]:
# Give the two columns whose labels contain spaces/symbols plain identifiers
# so they can be referenced easily (e.g. inside model formulas).
# Reassigning the result instead of using `inplace=True` avoids mutating a
# frame that earlier cells have already displayed; the resulting `df` is the same.
df = df.rename(
    columns={
        "suicides/100k pop": "suicides_per_100k",
        "gdp_per_capita ($)": "gdp_per_capita",
    }
)

In [18]:
# Preview the first five rows.
# NOTE(review): the recorded output shows `country` as an ordinary column with
# an integer index, which is only the case after the later `df.reset_index(...)`
# cell has run. The saved output appears to come from out-of-order execution;
# confirm with Restart Kernel -> Run All.
df.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides_per_100k,country-year,HDI for year,gdp_for_year ($),gdp_per_capita,generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [11]:
# Distinct age-group labels, keeping the first occurrence of each.
df["age"].drop_duplicates()

country
Albania    15-24 years
Albania    35-54 years
Albania      75+ years
Albania    25-34 years
Albania    55-74 years
Albania     5-14 years
Name: age, dtype: object

In [12]:
# Distinct index labels, keeping the first occurrence of each. The index holds
# the country names here, so this lists the countries covered by the dataset.
df.index.drop_duplicates()

Index(['Albania', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       ...
       'Thailand', 'Trinidad and Tobago', 'Turkey', 'Turkmenistan', 'Ukraine',
       'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay',
       'Uzbekistan'],
      dtype='object', name='country', length=101)

In [13]:
# Distinct generation labels, keeping the first occurrence of each.
df["generation"].drop_duplicates()

country
Albania       Generation X
Albania             Silent
Albania    G.I. Generation
Albania            Boomers
Albania         Millenials
Albania       Generation Z
Name: generation, dtype: object

In [16]:
# Move `country` out of the index into a regular column; the model formula
# below refers to it by name.
# The guard makes this cell safe to re-run: an unguarded second reset would
# push the default integer index into a stray `index` column.
if "country" not in df.columns:
    df = df.reset_index()

### 2. Linear combination 

In [19]:
# Poisson GLM for the raw suicide count with country, sex and age group as
# categorical predictors. The leading `0 +` removes the global intercept.
fml = "suicides_no ~ 0 + C(country) + C(sex) + C(age)"
m1 = sm.GLM.from_formula(fml, data=df, family=sm.families.Poisson())
# scale="X2" estimates the dispersion from the Pearson chi-square rather than
# fixing it at 1 (quasi-Poisson standard errors); the summary below reports
# Scale = 52.071, i.e. strong overdispersion.
# NOTE(review): `population` is not supplied as an exposure/offset, so this
# models counts rather than rates -- confirm that is intended.
r1 = m1.fit(scale="X2")
print(r1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:            suicides_no   No. Observations:                27820
Model:                            GLM   Df Residuals:                    27713
Model Family:                 Poisson   Df Model:                          106
Link Function:                    log   Scale:                          52.071
Method:                          IRLS   Log-Likelihood:                -13611.
Date:                Wed, 11 Nov 2020   Deviance:                   1.2869e+06
Time:                        17:37:30   Pearson chi2:                 1.44e+06
No. Iterations:                    24                                         
Covariance Type:            nonrobust                                         
                                               coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------

In [21]:
# One-hot encode `sex` into `sex_female` / `sex_male` indicator columns.
# The dtype is pinned to uint8 (the default before pandas 2.0) so the
# indicators remain 0/1 integers on pandas >= 2.0, where the default is bool.
gender = pd.get_dummies(df[["sex"]], dtype="uint8")
gender

Unnamed: 0,sex_female,sex_male
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1
...,...,...
27815,1,0
27816,1,0
27817,0,1
27818,1,0
