1. Predicting Employee Attrition Using Logistic Regression
   Dataset: HR Analytics Employee Attrition Dataset
   Preprocessing Steps:
     - Handle missing values if any.
     - Encode categorical variables (e.g., one-hot encoding for department, gender, etc.).
     - Standardize numerical features.
   Task: Implement logistic regression to predict employee attrition and evaluate the model using precision, recall, and F1-score.


In [21]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Load the dataset
file_path = '/content/WA_Fn-UseC_-HR-Employee-Attrition (1).csv'
df = pd.read_csv(file_path)

# Encode the target variable
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0})

# Select categorical features for encoding
categorical_features = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'OverTime']

# Apply one-hot encoding to categorical features
df_encoded = pd.get_dummies(df, columns=categorical_features)

# Select numerical features for standardization
numerical_features = ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
                      'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
                      'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',
                      'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
                      'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
                      'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

# Standardize numerical features
scaler = StandardScaler()
df_encoded[numerical_features] = scaler.fit_transform(df_encoded[numerical_features])

# Check the data types of all columns
print(df_encoded.dtypes)

# Ensure there are no non-numerical columns
non_numerical_columns = df_encoded.select_dtypes(include=['object']).columns
print("Non-numerical columns:", non_numerical_columns)

# If there are non-numerical columns, display unique values to understand what needs to be converted
for col in non_numerical_columns:
    print(f"Unique values in {col}: {df_encoded[col].unique()}")

# Assuming 'Over18' column is one of the problematic columns based on the previous traceback, let's inspect it
if 'Over18' in df_encoded.columns:
    print("Unique values in Over18 column:", df_encoded['Over18'].unique())

# Drop the 'Over18' column if it's not relevant or convert it to numerical if needed
df_encoded = df_encoded.drop(columns=['Over18'], errors='ignore')

# Convert any other necessary columns to numerical or drop them if they are irrelevant
# In this example, 'EmployeeCount', 'EmployeeNumber', 'StandardHours', and 'Over18' are irrelevant and can be dropped
columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18']
df_encoded = df_encoded.drop(columns=columns_to_drop, errors='ignore')

# Check data types again to ensure all are numerical
print(df_encoded.dtypes)

# Split the data into features and target variable
X = df_encoded.drop(columns=['Attrition'])
y = df_encoded['Attrition']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", report)


Age                                  float64
Attrition                              int64
DailyRate                            float64
DistanceFromHome                     float64
Education                            float64
EmployeeCount                          int64
EmployeeNumber                         int64
EnvironmentSatisfaction              float64
HourlyRate                           float64
JobInvolvement                       float64
JobLevel                             float64
JobSatisfaction                      float64
MonthlyIncome                        float64
MonthlyRate                          float64
NumCompaniesWorked                   float64
Over18                                object
PercentSalaryHike                    float64
PerformanceRating                    float64
RelationshipSatisfaction             float64
StandardHours                          int64
StockOptionLevel                     float64
TotalWorkingYears                    float64
TrainingTi