In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Memuat dataset dari file lokal
file_path = 'HR_comma_sep.csv'  # Ganti dengan path ke file lokal jika diperlukan
data = pd.read_csv(file_path)

# Melihat sekilas data
print("Sekilas data:")
print(data.head(), '\n')

# Melihat informasi umum dari dataset
print("Informasi dataset:")
print(data.info(), '\n')

# Statistik deskriptif
print("Statistik deskriptif:")
print(data.describe(), '\n')

# Mengecek distribusi kelas
print("Distribusi kelas 'left':")
print(data['left'].value_counts(), '\n')

# Pra-pemrosesan data
# Mengubah fitur kategorikal menjadi dummy variables
data = pd.get_dummies(data)

# Memisahkan fitur dan target
X = data.drop('left', axis=1)
y = data['left']

# Membagi data menjadi data train dan test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Normalisasi fitur numerik
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Membangun model Decision Tree
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Prediksi pada data test
y_pred = model.predict(X_test)

# Evaluasi model
print("Evaluasi model:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Contoh prediksi karyawan baru
# Data karyawan baru (sebagai contoh)
new_employee = pd.DataFrame({
    'satisfaction_level': [0.5],
    'last_evaluation': [0.8],
    'number_project': [3],
    'average_montly_hours': [150],
    'time_spend_company': [3],
    'Work_accident': [0],
    'promotion_last_5years': [0],
    'sales': ['sales'],
    'salary': ['low']
})

# Pra-pemrosesan data baru
new_employee = pd.get_dummies(new_employee)
new_employee = new_employee.reindex(columns=X.columns, fill_value=0)
new_employee = scaler.transform(new_employee)

# Prediksi
new_pred = model.predict(new_employee)
print("Prediksi churn karyawan baru:", new_pred)


Sekilas data:
   EmpID  Age AgeGroup Attrition     BusinessTravel  DailyRate  \
0  RM297   18    18-25       Yes      Travel_Rarely        230   
1  RM302   18    18-25        No      Travel_Rarely        812   
2  RM458   18    18-25       Yes  Travel_Frequently       1306   
3  RM728   18    18-25        No         Non-Travel        287   
4  RM829   18    18-25       Yes         Non-Travel        247   

               Department  DistanceFromHome  Education EducationField  ...  \
0  Research & Development                 3          3  Life Sciences  ...   
1                   Sales                10          3        Medical  ...   
2                   Sales                 5          3      Marketing  ...   
3  Research & Development                 5          2  Life Sciences  ...   
4  Research & Development                 8          1        Medical  ...   

   RelationshipSatisfaction  StandardHours  StockOptionLevel  \
0                         3             80              

KeyError: 'left'