In [28]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np


# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Realizar los distintos test
# ------------------------------------------------------------------------------

from scipy.stats import ttest_ind, norm, chi2_contingency, f_oneway
from sklearn.linear_model import LinearRegression


# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [29]:
df_ab_testing = pd.read_csv('df_transformado_limpio.csv')
df_ab_testing.sample(5)

Unnamed: 0,employeenumber,age,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework
501,3900,26,No,travel rarely,496.0,11,2,medical,1,Male,60,3,2,healthcare representative,1,married,48620.0,22722,1,No,13,3.0,3,1,5.0,3,3.0,5,3,3,1997,Yes
356,620,46,No,travel rarely,1211.0,5,4,life sciences,1,Male,98,3,2,sales executive,4,single,48620.0,20445,4,No,21,3.2,3,0,14.0,4,3.0,9,0,8,1977,Yes
476,19090,41,No,travel rarely,337.0,8,3,life sciences,3,Female,54,3,2,sales executive,2,married,48620.0,26841,5,No,21,4.0,3,1,14.0,3,3.0,5,1,4,1982,Yes
194,980,31,No,travel rarely,746.0,8,4,life sciences,3,Female,61,3,2,manufacturing director,4,single,44240.0,20682,1,No,23,4.0,4,0,11.0,2,3.0,11,1,8,1992,No
503,4220,58,No,travel rarely,390.0,1,4,life sciences,4,Male,32,1,2,healthcare representative,3,divorced,48620.0,17056,2,Yes,13,3.0,4,1,10.0,2,3.0,5,1,2,1965,Yes


In [4]:
# Crear la columna 'group' usando where 

df_ab_testing['group_AB_Testing'] = df_ab_testing['jobsatisfaction'].where(df_ab_testing['jobsatisfaction'] >= 3, 'A').where(df_ab_testing['jobsatisfaction'] < 3, 'B')

# Grupo A

In [6]:
# Filtrar el DataFrame para obtener solo los datos del grupo 'A'
grupo_a = df_ab_testing[df_ab_testing['group_AB_Testing'] == 'A']
grupo_a.head(1)

Unnamed: 0,employeenumber,age,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework,group_AB_Testing
5,810,59,No,travel rarely,1435.0,25,3,life sciences,1,Female,99,3,3,sales executive,1,married,48620.0,2354,7,No,11,3.0,4,0,28.0,3,2.0,21,7,9,1964,Yes,A


In [9]:
a_convertido = round ((83 / 415), 2) # 2 porcentaje de rotación 
a_convertido

0.2

-------------------------

# Grupo B

In [7]:
# Filtrar el DataFrame para obtener solo los datos del grupo 'A'
grupo_b = df_ab_testing[df_ab_testing['group_AB_Testing'] == 'B']
grupo_b.head(1)

Unnamed: 0,employeenumber,age,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,remotework,group_AB_Testing
0,1620,51,No,travel rarely,684.0,6,3,life sciences,1,Male,51,3,5,research director,3,married,195370.0,6462,7,No,13,3.0,3,0,10.0,5,3.0,20,15,15,1972,Yes,B


In [8]:
b_convertido = round ((94 / 665), 2) # 2 porcentaje de rotación 
b_convertido

0.14

-------------------

# Test Z

In [44]:

# Crear una nueva columna 'Group'
df_ab_testing['Group'] = df_ab_testing['jobsatisfaction'].apply(lambda x: 'A' if x >= 3 else 'B')

# Calcular la tasa de rotación para cada grupo
attrition_counts = df_ab_testing.groupby('Group')['attrition'].value_counts().unstack().fillna(0)
print(attrition_counts, '\n')

# Número de empleados con rotación en cada grupo
count_A = attrition_counts.loc['A', 'Yes']
count_B = attrition_counts.loc['B', 'Yes']

# Número total de empleados en cada grupo
n_A = attrition_counts.loc['A'].sum()
n_B = attrition_counts.loc['B'].sum()

# Tasa de rotación en cada grupo
p_A = count_A / n_A
p_B = count_B / n_B
print('Rotación (A):', p_A, 'Rotación (B):', p_B, '\n')

# Tasa de rotación combinada
p_combined = (count_A + count_B) / (n_A + n_B)

# Error estándar de la diferencia de proporciones
se = np.sqrt(p_combined * (1 - p_combined) * (1/n_A + 1/n_B))

# Estadístico Z
z_stat = (p_A - p_B) / se

# p-value
p_value = 2 * (1 - norm.cdf(abs(z_stat)))

print(f"Z-Statistic: {z_stat}, P-Value: {p_value}" '\n')

# Interpretación de los resultados
if p_value < 0.05:
    print("Existe una diferencia significativa en la tasa de rotación entre los grupos A y B.")
else:
    print("No existe una diferencia significativa en la tasa de rotación entre los grupos A y B.")


attrition   No  Yes
Group              
A          571   94
B          332   83 

Rotación (A): 0.14135338345864662 Rotación (B): 0.2 

Z-Statistic: -2.5325568447279956, P-Value: 0.011323400810056006

Existe una diferencia significativa en la tasa de rotación entre los grupos A y B.
