In [0]:
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import explained_variance_score as EVS
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

**Imports data and drops outliers**

In [0]:
# Load the raw table and discard three columns that are not used below.
d = pd.read_csv('masterlist.csv').drop(
    columns=['country-year', ' gdp_for_year ($) ', 'suicides/100k pop']
)

# Outlier filter: keep only rows that fall under both caps.
d = d[(d['suicides_no'] < 10000) & (d['gdp_per_capita ($)'] < 100000)]


**Finds categorical features**

In [0]:
def cat_features(dataframe):
    """Lazily yield the column names whose dtype is not the default int or float.

    The two reference dtypes are taken from a tiny probe frame built from
    integer and float literals, so "numeric" here means exactly the dtypes
    pandas infers for those literals; every other dtype counts as categorical.

    Returns a one-shot ``filter`` iterator over the column labels, in
    column order. Wrap it in ``list(...)`` to reuse the result.
    """
    probe = pd.DataFrame({'a': [1, 2, 3], 'b': [1.0, 2.0, 3.0]})
    numeric_dtypes = [probe['a'].dtype, probe['b'].dtype]

    def is_categorical(column):
        return dataframe[column].dtype not in numeric_dtypes

    return filter(is_categorical, list(dataframe))

# Materialise the lazy result so the column names can be reused.
C = [*cat_features(d)]

**One-Hot Encodes and splits X and Y data**

In [0]:
# Expand the four categorical columns into indicator columns; drop_first=True
# omits the first level of each so the indicators are not perfectly collinear.
df = pd.get_dummies(
    d,
    columns=['country', 'generation', 'age', 'sex'],
    drop_first=True,
)

# Target is the raw suicide count; every remaining column is a feature.
df_y = df['suicides_no']
df_x = df.drop(columns='suicides_no')


**Creates the Regression Model and splits the data into training and test sets**

In [0]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=4,
)

# Plain least-squares linear regression; it is fitted in a later cell.
model = linear_model.LinearRegression()


**Imputes missing values and scales the features**

In [6]:
# Mean-impute missing values, then rescale every feature with MinMaxScaler.
# SimpleImputer replaces preprocessing.Imputer, which was deprecated in
# scikit-learn 0.20 and removed in 0.22. Its default missing_values is NaN and
# it always imputes column-wise, which matches the old axis=0 setting.
imp = SimpleImputer(strategy='mean')
scaler = preprocessing.MinMaxScaler()

# Whole-dataset version: both transformers are fitted on every row here.
# NOTE(review): df_x1 is not used by the modelling cells visible in this
# notebook section - confirm it is still needed.
df_x1 = imp.fit_transform(df_x)
df_x1 = scaler.fit_transform(df_x1)

# Re-fit both transformers on the training split only...
train_x_pp = imp.fit_transform(x_train)
train_x_pp = scaler.fit_transform(train_x_pp)

# ...and reuse those training-set statistics on the held-out split so no
# information from the test rows leaks into the preprocessing.
test_x_pp = imp.transform(x_test)
test_x_pp = scaler.transform(test_x_pp)



**Fits and Predicts Model**

In [0]:
# Fit on the preprocessed training split, then score the held-out rows.
# (fit() returns the estimator itself, so the two calls can be chained.)
predict = model.fit(train_x_pp, y_train).predict(test_x_pp)

**Shows Error Metrics**

In [8]:
# Report each hold-out metric on its own line, in a fixed order.
print(
    *(
        label + str(metric(y_test, predict))
        for label, metric in (
            ('R2 Base: ', r2_score),
            ('Mean Squared Error: ', MSE),
            ('Explained Variance Score: ', EVS),
        )
    ),
    sep='\n',
)


R2 Base: 0.5813491911500984
Mean Squared Error: 173822.11194707625
Explained Variance Score: 0.5816333805522629


**Gets Model Coefficients**

In [9]:
# Coefficients learned by the scikit-learn model.
coef = model.coef_

# statsmodels does not add an intercept on its own, so prepend a constant column.
x2 = sm.add_constant(test_x_pp)

# NOTE(review): this OLS is fitted on the test split (y_test / test_x_pp),
# not on the training data `model` was fitted on, so the summary below
# describes a separate regression - confirm that this is intended.
est = sm.OLS(endog=y_test, exog=x2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:            suicides_no   R-squared:                       0.597
Model:                            OLS   Adj. R-squared:                  0.589
Method:                 Least Squares   F-statistic:                     69.77
Date:                Wed, 01 May 2019   Prob (F-statistic):               0.00
Time:                        18:21:37   Log-Likelihood:                -41041.
No. Observations:                5522   AIC:                         8.231e+04
Df Residuals:                    5406   BIC:                         8.308e+04
Df Model:                         115                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -148.5511     74.365     -1.998      0.0