# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the dataset
# The file marks missing entries with '?'. Declaring that at read time lets
# pandas parse the affected columns as numeric (with NaN) directly, which
# replaces the deprecated `pd.to_numeric(..., errors='ignore')` pass.
df = pd.read_csv("cervical cancer.csv", na_values="?")

# Step 2: Identify features and target
# The target must be removed from the feature matrix; leaving it in X lets the
# model read the answer straight from its inputs and makes every metric
# meaningless.
target_column = 'Hinselmann'
# 'Schiller' was already excluded from the features and stays excluded.
# NOTE(review): presumably it is another diagnostic outcome; confirm whether
# further outcome columns in the dataset should be excluded as well.
excluded_columns = [target_column, 'Schiller']
X = df.drop(columns=excluded_columns)
y = df[target_column]

# Encode the target as consecutive integers 0..n_classes-1
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 3: Splitting the dataset into train and test sets
# Split BEFORE imputing/scaling so that no statistic of the test rows
# influences the preprocessing applied to the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Step 4: Preprocessing, fitted on the training split only

# Handle missing values using KNN imputer
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Feature scaling (optional)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 5: Build the Random Forest Classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Step 6: Predict and Evaluate the model
y_pred = classifier.predict(X_test)

# Performance metrics
print("Accuracy:", accuracy_score(y_test, y_pred))

# Convert label_encoder classes to a list of strings
target_names = [str(cls) for cls in label_encoder.classes_]
# Pass the encoded labels explicitly so the report always has one row per
# known class, even when a class is absent from the test split; otherwise
# `target_names` would no longer line up with the labels found in the data.
report = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(target_names))),
    target_names=target_names,
)
print("Classification Report:\n", report)
