In [1]:
#importing required libraries

import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [2]:
#Loading data into dataframe

data = pd.read_csv("master.csv")
data.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [3]:
#Renaming the columns names for convinience

data.columns = ['country', 'year', 'gender', 'age_group', 'suicide_count', 
                'population', 'suicide_rate', 'country-year', 'HDI for year',
                'gdp_for_year', 'gdp_per_capita', 'generation']
data.columns

Index(['country', 'year', 'gender', 'age_group', 'suicide_count', 'population',
       'suicide_rate', 'country-year', 'HDI for year', 'gdp_for_year',
       'gdp_per_capita', 'generation'],
      dtype='object')

In [4]:
#Information about the dataset

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27820 entries, 0 to 27819
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         27820 non-null  object 
 1   year            27820 non-null  int64  
 2   gender          27820 non-null  object 
 3   age_group       27820 non-null  object 
 4   suicide_count   27820 non-null  int64  
 5   population      27820 non-null  int64  
 6   suicide_rate    27820 non-null  float64
 7   country-year    27820 non-null  object 
 8   HDI for year    8364 non-null   float64
 9   gdp_for_year    27820 non-null  object 
 10  gdp_per_capita  27820 non-null  int64  
 11  generation      27820 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 2.5+ MB


In [5]:
data.describe()

Unnamed: 0,year,suicide_count,population,suicide_rate,HDI for year,gdp_per_capita
count,27820.0,27820.0,27820.0,27820.0,8364.0,27820.0
mean,2001.258375,242.574407,1844794.0,12.816097,0.776601,16866.464414
std,8.469055,902.047917,3911779.0,18.961511,0.093367,18887.576472
min,1985.0,0.0,278.0,0.0,0.483,251.0
25%,1995.0,3.0,97498.5,0.92,0.713,3447.0
50%,2002.0,25.0,430150.0,5.99,0.779,9372.0
75%,2008.0,131.0,1486143.0,16.62,0.855,24874.0
max,2016.0,22338.0,43805210.0,224.97,0.944,126352.0


In [6]:
#checking the data for null or missing values

data.isnull().sum()

country               0
year                  0
gender                0
age_group             0
suicide_count         0
population            0
suicide_rate          0
country-year          0
HDI for year      19456
gdp_for_year          0
gdp_per_capita        0
generation            0
dtype: int64

In [7]:
#dropping the HDI for year column

data = data.drop(['HDI for year'], axis = 1)
data.shape

(27820, 11)

In [8]:
#dropping the country-year for year column

data = data.drop(['country-year'], axis = 1)
data.shape

(27820, 10)

In [9]:
#dropping off any null rows (is any)

data = data.dropna()
data.shape

(27820, 10)

In [10]:
#encoding the categorical features with LabelEncoder

from sklearn.preprocessing import LabelEncoder
categorical = ['country', 'year','age_group', 'gender', 'generation']
le = sklearn.preprocessing.LabelEncoder()

for column in categorical:
    data[column] = le.fit_transform(data[column])

In [11]:
#creating a copy of dataset for statistical test

stat_data = data.copy()
stat_data

Unnamed: 0,country,year,gender,age_group,suicide_count,population,suicide_rate,gdp_for_year,gdp_per_capita,generation
0,0,2,1,0,21,312900,6.71,2156624900,796,2
1,0,2,1,2,16,308000,5.19,2156624900,796,5
2,0,2,0,0,14,289700,4.83,2156624900,796,2
3,0,2,1,5,1,21800,4.59,2156624900,796,1
4,0,2,1,1,9,274300,3.28,2156624900,796,0
...,...,...,...,...,...,...,...,...,...,...
27815,100,29,0,2,107,3620833,2.96,63067077179,2309,2
27816,100,29,0,5,9,348465,2.58,63067077179,2309,5
27817,100,29,1,3,60,2762158,2.17,63067077179,2309,3
27818,100,29,0,3,44,2631600,1.67,63067077179,2309,3


In [12]:
#improting required libraries
from scipy import stats

In [13]:
#collecting male suicide rate data
male = stat_data['suicide_rate'][stat_data['gender'] == 1]
male

0         6.71
1         5.19
3         4.59
4         3.28
8         0.73
         ...  
27809    11.61
27811    11.33
27812    11.10
27813     7.56
27817     2.17
Name: suicide_rate, Length: 13910, dtype: float64

In [14]:
#collecting female suicide rate data
female = stat_data['suicide_rate'][stat_data['gender'] == 0]
female

2        4.83
5        2.81
6        2.15
7        1.56
9        0.00
         ... 
27814    5.92
27815    2.96
27816    2.58
27818    1.67
27819    1.46
Name: suicide_rate, Length: 13910, dtype: float64

In [15]:
#calculating p value
ttest,pval = stats.ttest_rel(male, female)

if pval<0.05:
    print("Reject null hypothesis")
else:
    print("Accept null hypothesis")
    
ttest,pval

Reject null hypothesis


(97.47788487875155, 0.0)

In [16]:
#Creating Contingency Table
contingency_table = pd.crosstab(stat_data.suicide_rate, stat_data.age_group)

In [17]:
#Significance Level 5%
alpha=0.05

In [18]:
chistat, p, dof, expected = stats.chi2_contingency(contingency_table )

In [19]:
#critical_value
critical_value=stats.chi2.ppf(q=1-alpha,df=dof)
print('critical_value:',critical_value)

critical_value: 26864.700169422224


In [20]:
print('Significance level: ',alpha)
print('Degree of Freedom: ',dof)
print('chi-square statistic:',chistat)
print('critical_value:',critical_value)
print('p-value:',p) 
#Here, pvalue = 0.0 and a low pvalue suggests that your sample provides enough evidence that you can reject  H0  for the entire population.

Significance level:  0.05
Degree of Freedom:  26485
chi-square statistic: 39571.48433519147
critical_value: 26864.700169422224
p-value: 0.0


In [21]:
#compare chi_square_statistic with critical_value and p-value which is the 
 #probability of getting chi-square>0.09 (chi_square_statistic)
if chistat>=critical_value:
    print("Reject H0,There is a dependency between Age group & Suicide rate.")
else:
    print("Retain H0,There is no relationship between Age group & Suicide rate.")
    
if p<=alpha:
    print("Reject H0,There is a dependency between Age group & Suicide rate.")
else:
    print("Retain H0,There is no relationship between Age group & Suicide rate.")

Reject H0,There is a dependency between Age group & Suicide rate.
Reject H0,There is a dependency between Age group & Suicide rate.


In [22]:
#Creating Contingency Table
contingency_table2 = pd.crosstab(stat_data.suicide_rate, stat_data.generation)

In [23]:
#Significance Level 5%
alpha=0.05

In [24]:
chistat, p, dof, expected = stats.chi2_contingency(contingency_table2 )

In [25]:
#critical_value
critical_value=stats.chi2.ppf(q=1-alpha,df=dof)
print('critical_value:',critical_value)

critical_value: 26864.700169422224


In [26]:
print('Significance level: ',alpha)
print('Degree of Freedom: ',dof)
print('chi-square statistic:',chistat)
print('critical_value:',critical_value)
print('p-value:',p) 

Significance level:  0.05
Degree of Freedom:  26485
chi-square statistic: 31584.46289376714
critical_value: 26864.700169422224
p-value: 4.174669995610524e-97


In [27]:
#compare chi_square_statistic with critical_value and p-value which is the 
 #probability of getting chi-square>0.09 (chi_square_statistic)
if chistat>=critical_value:
    print("Reject H0,There is a dependency between Generation & Suicide rate.")
else:
    print("Retain H0,There is no relationship between Generation & Suicide rate.")
    
if p<=alpha:
    print("Reject H0,There is a dependency between Generation & Suicide rate.")
else:
    print("Retain H0,There is no relationship between Generation & Suicide rate.")

Reject H0,There is a dependency between Generation & Suicide rate.
Reject H0,There is a dependency between Generation & Suicide rate.
