In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

In [3]:
# Load the raw dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    filepath_or_buffer="../data/suicide.csv",
    index_col=0,
)

In [4]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,country,year,sex,age,suicides_no,population,gdp_for_year,gdp_per_capita,generation,continent
1,Albania,1987,Male,15-24,21,312900.0,2156625000.0,796,Generation X,Europe
2,Albania,1987,Male,35-54,16,308000.0,2156625000.0,796,Silent,Europe
3,Albania,1987,Female,15-24,14,289700.0,2156625000.0,796,Generation X,Europe
4,Albania,1987,Male,75+,1,21800.0,2156625000.0,796,G.I. Generation,Europe
5,Albania,1987,Male,25-34,9,274300.0,2156625000.0,796,Boomers,Europe


In [5]:
# List the distinct generation labels (needed to build the ordinal mapping).
pd.unique(data["generation"])

array(['Generation X', 'Silent', 'G.I. Generation', 'Boomers',
       'Millenials', 'Generation Z'], dtype=object)

In [6]:
# Ordinal rank (1-based) for each generation label. The spelling "Millenials"
# is kept as-is because it is the label that appears in the dataset.
_gen_order = {
    label: rank
    for rank, label in enumerate(
        (
            "G.I. Generation",
            "Silent",
            "Boomers",
            "Generation X",
            "Millenials",
            "Generation Z",
        ),
        start=1,
    )
}

# Ordinal rank (1-based) for each age bracket, youngest bracket first.
_age_order = {
    bracket: rank
    for rank, bracket in enumerate(
        ("5-14", "15-24", "25-34", "35-54", "55-74", "75+"),
        start=1,
    )
}


def map_generation(gen: str) -> int:
    """Return the ordinal rank of a generation label.

    Raises:
        KeyError: if ``gen`` is not one of the known generation labels.
    """
    rank = _gen_order[gen]
    return rank


def map_age(age: str) -> int:
    """Return the ordinal rank of an age-bracket label.

    Raises:
        KeyError: if ``age`` is not one of the known age brackets.
    """
    rank = _age_order[age]
    return rank

In [161]:
# Preprocess the data

# Nominal columns that get one-hot encoded (and are then dropped in raw form).
categorical_cols = ["country", "continent", "sex"]

# Work on a copy so the raw `data` frame stays untouched and this cell can be re-run.
ml_data = data.copy()

# Regression target: suicides per 100,000 population.
ml_data["suicide_rate"] = (ml_data["suicides_no"] / ml_data["population"]) * 100000

# Ordered categoricals become integer ranks.
ml_data["generation"] = ml_data["generation"].apply(map_generation)
ml_data["age"] = ml_data["age"].apply(map_age)

enc = OneHotEncoder()
encoded = enc.fit_transform(ml_data[categorical_cols])

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and only fall back on older installations.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(categorical_cols)
else:
    encoded_names = enc.get_feature_names(categorical_cols)
encoded = pd.DataFrame.sparse.from_spmatrix(encoded, columns=encoded_names)

# Drop the raw categoricals and `suicides_no` (the target is derived from it).
# The index is reset so the positional index of `encoded` lines up in the concat.
ml_data = ml_data.drop(columns=categorical_cols + ["suicides_no"]).reset_index(drop=True)
ml_data = pd.concat((ml_data, encoded), axis=1)


In [106]:
# Preview the first five rows of the model-ready frame.
ml_data.head(n=5)

Unnamed: 0,year,age,population,gdp_for_year,gdp_per_capita,generation,suicide_rate,pop_inv,country_Albania,country_Antigua and Barbuda,...,country_United States,country_Uruguay,country_Uzbekistan,continent_Africa,continent_Americas,continent_Asia,continent_Europe,continent_Oceania,sex_Female,sex_Male
0,1987,2,312900.0,2156625000.0,796,4,6.711409,3e-06,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1987,4,308000.0,2156625000.0,796,2,5.194805,3e-06,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,1987,2,289700.0,2156625000.0,796,4,4.832585,3e-06,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,1987,6,21800.0,2156625000.0,796,1,4.587156,4.6e-05,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1987,3,274300.0,2156625000.0,796,3,3.281079,4e-06,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [107]:
# Separate the regression target from the predictor columns.
target_col = "suicide_rate"
y = ml_data[target_col]
X = ml_data.drop(columns=[target_col])

In [156]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [157]:
# Fit the scaler on the training split only, then apply that same fitted
# scaling to both splits so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)



In [158]:
# Create the polynomial features
# Degree-2 expansion: bias term, original features, squares and pairwise products.
poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
# Only transform the test split: the transformer must be fitted on training
# data alone (same convention as the scaler), never refitted on test data.
X_test = poly.transform(X_test)

In [159]:
# L1-regularised linear regression on the expanded feature matrix.
lasso = Lasso(alpha=0.9)
# Left as the last expression so the fitted estimator is displayed.
lasso.fit(X=X_train, y=y_train)

Lasso(alpha=0.9)

In [160]:
# R^2 on the held-out split (the same quantity a regressor's `.score` reports).
r2_score(y_test, lasso.predict(X_test))

0.6215341618926791

In [115]:
# Tabulate the fitted Lasso coefficients, one row per polynomial feature.
input_names = X.columns.to_numpy()

# `PolynomialFeatures.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer `get_feature_names_out` and fall back on old versions.
if hasattr(poly, "get_feature_names_out"):
    feature_names = poly.get_feature_names_out(input_names)
else:
    feature_names = poly.get_feature_names(input_names)

# Index = feature name, "coef" = signed weight, "coef_abs" = magnitude for ranking.
coefs = pd.DataFrame({"coef": lasso.coef_}, index=feature_names)
coefs["coef_abs"] = coefs["coef"].abs()

In [116]:
# Show the ten features with the largest coefficient magnitude.
(
    coefs
    .sort_values("coef_abs", ascending=False)
    .iloc[:10]
)

Unnamed: 0,coef,coef_abs
sex_Female,-6.4889,6.4889
age,5.144366,5.144366
age sex_Female,-3.138619,3.138619
generation continent_Europe,-1.458457,1.458457
continent_Europe sex_Female,-1.235217,1.235217
country_Lithuania sex_Female,-0.985701,0.985701
generation,-0.936264,0.936264
age country_Republic of Korea,0.826474,0.826474
age continent_Europe,0.728462,0.728462
generation country_Hungary,-0.641669,0.641669
