In [328]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Step 1: Load data from Excel file
file_path = "Gold Mines Dataset Subsample CONFIDENTIAL 23 02 2024.xlsx"
df = pd.read_excel(file_path)

df.describe()
# df['prop_combined_info'] = df[['PROP_NAME','PRIMARY_COMMODITY', 'SNL_GLOBAL_REGION', 'COUNTRY_NAME', 'STATE_PROVINCE']].fillna('').agg('  '.join, axis=1)

# df['own_combined_info'] = df[['OWNER_NAME','PRIMARY_COMMODITY', 'SNL_GLOBAL_REGION', 'COUNTRY_NAME', 'STATE_PROVINCE']].fillna('').agg('  '.join, axis=1)

df['prop_combined_info'] = df[['PROP_NAME',]].fillna('').agg('  '.join, axis=1)

df['own_combined_info'] = df[['OWNER_NAME', ]].fillna('').agg('  '.join, axis=1)

# Step 2: Prepare data for mines and company-owners
mine_data = {
    "text": df['prop_combined_info'].tolist(),
    "label": [1] * len(df['PROP_NAME'])  # Label 1 for Mine
}

company_data = {
    "text": df['own_combined_info'].tolist(),
    "label": [2] * len(df['OWNER_NAME'])  # Label 2 for Company-Owner
}

# Step 3: Convert to DataFrames
mine_df = pd.DataFrame(mine_data)
company_df = pd.DataFrame(company_data)

# Combine both dataframes
formatted_df = pd.concat([mine_df, company_df], ignore_index=True)
formatted_df = formatted_df.dropna(subset=["text"])  # Drop rows with missing text

# Shuffle the data
formatted_df = formatted_df.sample(frac=1, random_state=45).reset_index(drop=True)

# Step 4: Convert text to embeddings using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=15000, stop_words="english", ngram_range=(1, 5))
X = vectorizer.fit_transform(formatted_df["text"])
y = formatted_df["label"]

# Save the vectorizer
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

# Step 5: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)

# Step 6: Define models
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=1000, max_depth=50, min_samples_split=2
    ),
    "Logistic Regression": LogisticRegression(
        C=10.0, solver='lbfgs', max_iter=1000, random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.01, max_depth=50, random_state=42
    ),
    "Support Vector Machine": SVC(
        C=10.0, kernel='rbf', probability=True, random_state=42
    ),
}

# Step 7: Train and save models
for model_name, model in models.items():
    model.fit(X_train, y_train)
    joblib.dump(model, f"{model_name.replace(' ', '_')}_model.pkl")  # Save each model to a file

# Step 8: Load models and vectorizer
loaded_vectorizer = joblib.load("tfidf_vectorizer.pkl")
loaded_models = {
    model_name: joblib.load(f"{model_name.replace(' ', '_')}_model.pkl") for model_name in models.keys()
}


In [341]:

# Step 9: Predict on new texts
new_texts = [
    "AuVert Mining Group Limited",  # Example text
]
new_texts_embedded = loaded_vectorizer.transform(new_texts)

# Set a confidence threshold
threshold = 0.6

# Create DataFrame for predictions
predictions_data = []
for model_name, model in loaded_models.items():
    new_predictions = model.predict(new_texts_embedded)
    new_probabilities = model.predict_proba(new_texts_embedded) if hasattr(model, "predict_proba") else None
    for i, text in enumerate(new_texts):
        pred = new_predictions[i]
        prob = max(new_probabilities[i]) if new_probabilities is not None else None  # Probability of the predicted class
        
        # Check if the probability is above the threshold
        if prob is not None and prob < threshold:
            prediction = "Uncertain"
        else:
            prediction = "Mine" if pred == 1 else "Company"
        
        predictions_data.append({
            "Model": model_name,
            "Text": text,
            "Prediction": prediction,
            "Probability": f"{prob:.5f}" if prob is not None else "N/A"
        })

# Step 10: Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions_data)

# Save predictions to CSV
predictions_df.to_csv("predictions_for_new_texts.csv", index=False)

# Display predictions
predictions_df

Unnamed: 0,Model,Text,Prediction,Probability
0,Random Forest,AuVert Mining Group Limited,Company,0.9441
1,Logistic Regression,AuVert Mining Group Limited,Company,0.97065
2,Gradient Boosting,AuVert Mining Group Limited,Company,0.81818
3,Support Vector Machine,AuVert Mining Group Limited,Company,1.0
