In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import numpy as np

# Load the Iris dataset as a feature matrix and a label vector
data = load_iris()
X, y = data.data, data.target

# K-Fold cross-validation example.
# RandomForestClassifier is a randomized estimator; fixing random_state makes
# the fold scores (and therefore the printed summary) repeatable across runs.
model = RandomForestClassifier(random_state=42)
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Report the mean accuracy over the 5 folds together with its standard deviation
print(f"Accuracy: {np.mean(scores):.2f} (+/- {np.std(scores):.2f})")


Accuracy: 0.97 (+/- 0.02)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset into training and testing sets (test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialise and train the model.
# The split above is already seeded; the forest is seeded as well so that the
# confusion matrix printed below is repeatable from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict labels for the held-out test set
y_pred = model.predict(X_test)

# Confusion matrix of true labels vs. predicted labels
cm = confusion_matrix(y_test, y_pred)
print(cm)


[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [5]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset into training and testing sets (test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model and the parameter grid.
# The estimator is seeded so that every grid candidate is scored under the
# same randomness and best_params_ / best_score_ do not change between runs.
model = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

# Exhaustive grid search with 5-fold cross-validation on the training split
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# best_score_ is the cross-validated accuracy of the best candidate
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'n_estimators': 200}
Best Score: 0.95


In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Define the model and the parameter distributions.
# NOTE: this cell reuses RandomForestClassifier, X_train and y_train from the
# grid-search cell above, so that cell must have been run first.
# The estimator is seeded so candidate scores are repeatable.
model = RandomForestClassifier(random_state=42)
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'bootstrap': [True, False]
}

# Random search: n_iter=10 candidates are sampled from param_dist.
# random_state fixes which candidates are sampled, so the reported best
# parameters do not depend on the luck of the draw.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters: {'n_estimators': 50, 'max_depth': None, 'bootstrap': True}
Best Score: 0.95


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Read the spam dataset from disk
df = pd.read_csv('spam_dataset_id.csv')

# Show the available columns so the names used below can be checked
print(df.columns)

# Features are the email text column; labels are the is_spam column
# (1 = spam, 0 = not spam)
X, y = df['email_text'], df['is_spam']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline: token counts -> TF-IDF transform -> Multinomial Naive Bayes
model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

# Train on the training split, then predict the testing split
y_pred = model.fit(X_train, y_train).predict(X_test)

# Evaluate accuracy on the testing split
accuracy = accuracy_score(y_test, y_pred)
print(f'Akurasi Model: {accuracy:.2f}')


Index(['email_text', 'word_freq_gratis', 'word_freq_menang', 'word_freq_uang',
       'word_freq_diskon', 'is_spam'],
      dtype='object')
Akurasi Model: 0.50


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# CASE STUDY 1

# Read the spam dataset from disk
df = pd.read_csv('spam_dataset_id.csv')

# Print the column names to confirm they match what is used below
print(df.columns)

# Email text is the input; is_spam is the label (1 = spam, 0 = not spam)
X, y = df['email_text'], df['is_spam']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline: token counts -> TF-IDF transform -> seeded Decision Tree
model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Train on the training split, then predict the testing split
y_pred = model.fit(X_train, y_train).predict(X_test)

# Report accuracy followed by the per-class classification report
accuracy = accuracy_score(y_test, y_pred)
print(f'Akurasi Model Decision Tree: {accuracy:.2f}')
print(classification_report(y_test, y_pred))


Index(['email_text', 'word_freq_gratis', 'word_freq_menang', 'word_freq_uang',
       'word_freq_diskon', 'is_spam'],
      dtype='object')
Akurasi Model Decision Tree: 1.00
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# CASE STUDY 2

# Read the housing data (the CSV file must already be available)
df = pd.read_csv("rumah.csv")

# The "harga" column is the target; every other column is a feature
y = df["harga"]
X = df.drop(columns=["harga"])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameter candidates shared by both randomized searches
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Random Forest: seeded estimator tuned with a seeded randomized search
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Gradient Boosting: same search settings as the Random Forest
gb = GradientBoostingRegressor(random_state=42)
gb_random = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param_grid,
    n_iter=10,
    cv=3,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Evaluate the best estimator of each search on the testing split
models = {
    "Random Forest": rf_random.best_estimator_,
    "Gradient Boosting": gb_random.best_estimator_,
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"Model: {name}", f"MAE: {mae}", f"RMSE: {rmse}\n", sep="\n")


Model: Random Forest
MAE: 77833333.33333333
RMSE: 90875143.2827848

Model: Gradient Boosting
MAE: 62645917.8541468
RMSE: 86716723.75249463

