<a href="https://colab.research.google.com/github/yeho/scikit-learn-AI/blob/master/Deteccion_de_spam__NB_SVM_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
import lime
import lime.lime_text
import re # Import regular expressions

# Use the non-interactive 'agg' backend so matplotlib never tries to open a
# display window.
plt.switch_backend('agg')

# Load the train/test splits from local CSV files.  Only the reads live inside
# the ``try`` so that a failure while previewing the frames is not misreported
# as a file-loading problem.
try:
    train_df = pd.read_csv('/content/sample_data/train.csv')
    test_df = pd.read_csv('/content/sample_data/test.csv')
except FileNotFoundError:
    print("Asegúrate de que los archivos train.csv y test.csv estén en el directorio /content/sample_data/.")
    # Everything after this point needs train_df/test_df; stop here instead of
    # continuing into a confusing NameError further down.
    raise
except Exception as e:
    print(f"Ocurrió un error al cargar los archivos: {e}")
    raise
else:
    # Preview both frames (``display`` is the notebook's rich-output helper).
    print("Primeras 5 filas del conjunto de entrenamiento:")
    display(train_df.head())

    print("\nPrimeras 5 filas del conjunto de prueba:")
    display(test_df.head())

# From here on the pipeline only works with train_df / test_df, so it does not
# depend on how the two frames were loaded.

# Report the per-column count of missing values for each split.
for _heading, _frame in (
    ("\nMissing values in training data:", train_df),
    ("\nMissing values in testing data:", test_df),
):
    print(_heading)
    print(_frame.isnull().sum())

# Split each frame into the raw message text (features) and its label.
X_train, y_train = train_df['text'], train_df['label']
X_test, y_test = test_df['text'], test_df['label']

# Show the shape of every resulting series.
for _caption, _series in (
    ("\nShape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(_caption, _series.shape)

# Turn the text into token-count vectors.  The vectorizer is fitted on the
# training text only; the test text is transformed with that fitted
# vectorizer.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train the three classifiers on the same count vectors and labels.

# Multinomial Naive Bayes.
model_nb = MultinomialNB()
model_nb.fit(X_train_vec, y_train)
print("Multinomial Naive Bayes model training complete.")

# SVM.  probability=True makes the model expose predict_proba, which the LIME
# predictor further down calls.  The probability estimates involve random
# numbers, so the seed is fixed to keep results reproducible across runs.
model_svm = SVC(probability=True, random_state=42)
model_svm.fit(X_train_vec, y_train)
print("SVM model training complete.")

# Random Forest.  Seeded for the same reason: without a fixed random_state the
# fitted forest (and everything derived from it) changes from run to run.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_vec, y_train)
print("Random Forest model training complete.")

def _evaluate_and_report(title, model, features, labels):
    """Score *model* on the given data, print a short report, return the results.

    Args:
        title: Heading line printed before the metrics.
        model: Fitted classifier exposing ``predict``.
        features: Vectorized inputs passed to ``model.predict``.
        labels: True labels the predictions are compared against.

    Returns:
        Tuple ``(predictions, accuracy, precision, recall, f1)``.  Precision,
        recall and F1 are computed with 'spam' as the positive label.
    """
    predictions = model.predict(features)
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, pos_label='spam')
    recall = recall_score(labels, predictions, pos_label='spam')
    f1 = f1_score(labels, predictions, pos_label='spam')

    print(title)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    # Separator between model reports.
    print("-" * 30)
    return predictions, accuracy, precision, recall, f1


# Evaluate each model on the test set.  The per-model result names are kept at
# module level so later cells can still refer to them.
y_pred_nb, accuracy_nb, precision_nb, recall_nb, f1_nb = _evaluate_and_report(
    "Multinomial Naive Bayes Model Evaluation:", model_nb, X_test_vec, y_test
)

y_pred_svm, accuracy_svm, precision_svm, recall_svm, f1_svm = _evaluate_and_report(
    "SVM Model Evaluation:", model_svm, X_test_vec, y_test
)

y_pred_rf, accuracy_rf, precision_rf, recall_rf, f1_rf = _evaluate_and_report(
    "Random Forest Model Evaluation:", model_rf, X_test_vec, y_test
)

# ---- LIME interpretation ----
print("LIME Interpretation:")

# Build one text explainer, labelled with the class names stored on the Naive
# Bayes model.  The same explainer is reused for the SVM and Random Forest
# below; NOTE(review): that assumes all three models expose the classes in the
# same order -- confirm.
class_names = model_nb.classes_
explainer = lime.lime_text.LimeTextExplainer(
    class_names=class_names,
)

# Function to explain a single instance prediction
def explain_instance_lime(text_instance, model, vectorizer, explainer, num_features=6):
    """Explain one text prediction with LIME and print the feature weights.

    Args:
        text_instance: Raw text whose prediction should be explained.
        model: Fitted classifier exposing ``predict_proba``.
        vectorizer: Fitted vectorizer; its ``transform`` converts a list of
            texts into the feature matrix ``model`` expects.
        explainer: Object exposing ``explain_instance(text, predict_fn,
            num_features=...)``, e.g. a ``LimeTextExplainer``.
        num_features: Number of features requested from the explainer.  The
            default of 6 matches the value that used to be hard-coded.

    Returns:
        The explanation object produced by ``explainer.explain_instance``, so
        callers can inspect it further; the printed summary is unchanged.
    """
    def predictor(texts):
        # LIME hands over raw strings; vectorize them the same way as the
        # training data before asking the model for class probabilities.
        return model.predict_proba(vectorizer.transform(texts))

    explanation = explainer.explain_instance(
        text_instance,
        predictor,
        num_features=num_features,
    )

    print(f"\nExplanation for instance: '{text_instance}'")
    for feature, weight in explanation.as_list():
        print(f"  {feature}: {weight:.4f}")
    print("-" * 20)

    return explanation


# Explain the first three test messages (fewer if the test set is smaller)
# with each of the trained models in turn.
for _heading, _model in (
    ("\nLIME Explanation for Multinomial Naive Bayes:", model_nb),
    ("\nLIME Explanation for SVM:", model_svm),
    ("\nLIME Explanation for Random Forest:", model_rf),
):
    print(_heading)
    for _text in X_test.iloc[:3]:
        explain_instance_lime(_text, _model, vectorizer, explainer)

Primeras 5 filas del conjunto de entrenamiento:


Unnamed: 0,label,text
0,ham,Let's catch up for coffee this weekend.
1,spam,Get rich quick with this amazing investment op...
2,ham,Don't forget to bring the documents for tomorr...
3,spam,Get rich quick with this amazing investment op...
4,ham,Please find the attached report for your review.



Primeras 5 filas del conjunto de prueba:


Unnamed: 0,label,text
0,ham,con motivo del evento anual de integracion de ...
1,ham,sign up in the anual event to win a surprise o...
2,ham,Don't forget to bring the documents for tomorr...
3,ham,Let's catch up for coffee this weekend.
4,ham,Please find the attached report for your review.



Missing values in training data:
label    0
text     0
dtype: int64

Missing values in testing data:
label    0
text     0
dtype: int64

Shape of X_train: (1000,)
Shape of y_train: (1000,)
Shape of X_test: (4002,)
Shape of y_test: (4002,)
Multinomial Naive Bayes model training complete.
SVM model training complete.
Random Forest model training complete.
Multinomial Naive Bayes Model Evaluation:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
------------------------------
SVM Model Evaluation:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
------------------------------
Random Forest Model Evaluation:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
------------------------------
LIME Interpretation:

LIME Explanation for Multinomial Naive Bayes:

Explanation for instance: 'con motivo del evento anual de integracion de TI participa en esta encuesta para poder ganar un boleto al concierto de luis miguel'
  con: 0.0000
  motivo: 0

In [11]:
%pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m256.0/275.7 kB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=6fa0ad70f131708e5b32fb58f1392722b9502f914144208854f0431e93756975
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
