In [2]:
# Chargement des bibliothèques utilisées.
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [3]:
# Répertoire où sont les données issues de l'enquête emploi 2012.

workdir =  '/Users/michalurdanivia/Google Drive/Enseignement/UGA/Econ_S2_UGA_M1_MIASH/Donnees/eemploi2012/'

# Lecture des données.
"""
Remarque.
Les données sont un extrait de l'enquête emploi 2012 construit par le script "eemploi2012_s0.R".

"""
eedata = pd.read_csv(workdir+'eemploi2012_s0.csv', low_memory = False ) 
eedata.shape # dimensions du fichiers: nombre de lignes/observations x nombre de colonnes/variables.

(31045, 801)

In [3]:
# Sélection d'un échantillon pour travailler.

"""
Remarque.
Pour comprendre les différentes variables vous devez vous reporter aux dictionnaire
correspondant à l'enquête emploi 2012, et au script R utilisé pour construire plusieurs 
d'entre elles.

"""

## On considère les individus en emploi.
eedata = eedata[['DDIPL1' , 'DDIPL3' , 'DDIPL4' , 'DDIPL5' , 'DDIPL6', 'DDIPL7','SEXE2', 'SEXE1', \
                 'ANCENTR41' , 'ANCENTR42', 'ANCENTR43', 'ANCENTR44', 'AG', \
                 'lsalhor' ]][eedata.ACTEU1 == 1]
eedata.shape
eedata.head() # Premières lignes.
eedata.describe() # Quelques statistiques descriptives.

Unnamed: 0,DDIPL1,DDIPL3,DDIPL4,DDIPL5,DDIPL6,DDIPL7,SEXE2,SEXE1,ANCENTR41,ANCENTR42,ANCENTR43,ANCENTR44,AG,lsalhor
count,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0,19764.0
mean,0.141571,0.159482,0.194191,0.281826,0.071241,0.15169,0.510018,0.489982,0.116879,0.23725,0.177444,0.468427,41.31527,2.410269
std,0.348618,0.366134,0.395587,0.4499,0.257233,0.358729,0.499912,0.499912,0.321284,0.425407,0.382054,0.499015,10.460641,0.398223
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.026145
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,2.15284
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,42.0,2.352846
75%,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,50.0,2.607141
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,59.0,7.833996


In [4]:
# Regression linéaire.
"""
Nous allons commencer par la régression linéaire du log du salaire sur les indicatrices du diplôme, 
du sexe, de l'experience, et de l'ancienneté.
"""
eedata['AG2'] = np.log(eedata.AG) # Expérience au carré.

Y    = eedata['lsalhor'] # Régressande.
X    = eedata[['DDIPL1' , 'DDIPL3' , 'DDIPL4' , 'DDIPL5' , 'DDIPL6', 'SEXE2', \
              'ANCENTR42', 'ANCENTR43', 'ANCENTR44', 'AG', 'AG2']] # Régresseurs.
X = sm.add_constant(X) # On ajoute une constante.

# Estimation de la régression par MCO.
# Matrice de variances-convariances robuste à l'hétéroscédasticité de White(1980).
lin_reg_ols = sm.OLS(Y, X).fit(cov_type='HC0')
print ('------------------------------------------------------------------------------')
print ('- Régression linéaire pour le log du salaire                                  ')
print ('------------------------------------------------------------------------------')
print ('')
print(lin_reg_ols.summary())

------------------------------------------------------------------------------
- Régression linéaire pour le log du salaire                                  
------------------------------------------------------------------------------

                            OLS Regression Results                            
Dep. Variable:                lsalhor   R-squared:                       0.318
Model:                            OLS   Adj. R-squared:                  0.318
Method:                 Least Squares   F-statistic:                     691.9
Date:                Wed, 18 Jan 2017   Prob (F-statistic):               0.00
Time:                        16:55:59   Log-Likelihood:                -6058.0
No. Observations:               19764   AIC:                         1.214e+04
Df Residuals:                   19752   BIC:                         1.223e+04
Df Model:                          11                                         
Covariance Type:                  HC0              