# Model Explainability and Insights Analysis

In [1]:
## 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
import umap
import joblib

In [2]:
import shap
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import umap
import joblib

class ShapAnalyzer:
    """Wrap a SHAP ``TreeExplainer`` around a fitted model and a dataset."""

    def __init__(self, model, data, feature_names):
        """
        Initialize SHAP Analyzer.

        Args:
            model: Fitted model handed to ``shap.TreeExplainer``.
            data: Feature matrix to explain; must expose ``.shape``.
            feature_names: Names used to label the columns of ``data``.
        """
        self.model = model
        self.data = data
        self.feature_names = feature_names
        self.explainer = shap.TreeExplainer(model)

    def get_shap_values(self):
        """Calculate SHAP values for the dataset"""
        return self.explainer.shap_values(self.data)

    @staticmethod
    def _select_class_values(shap_values):
        """Return the second array of a two-element list, else the input unchanged."""
        if isinstance(shap_values, list) and len(shap_values) == 2:
            return shap_values[1]
        return shap_values

    def plot_feature_importance(self):
        """Plot global feature importance using SHAP values.

        Prints the type and shape(s) of the computed SHAP values together
        with the shape of the data, then draws a SHAP bar summary plot.
        When the explainer returns a two-element list (one array per class),
        only the second array is plotted.
        """
        shap_values = self.get_shap_values()

        # Debug statements to inspect shap_values
        print("SHAP Values Type:", type(shap_values))
        if isinstance(shap_values, list):
            print("SHAP Values Length:", len(shap_values))
            # Iterate rather than hard-indexing [0] and [1]: a list with a
            # single entry would otherwise raise IndexError, and entries
            # past the second would be silently skipped.
            for class_idx, class_values in enumerate(shap_values):
                print(f"SHAP Values Shape for Class {class_idx}:", class_values.shape)
        else:
            print("SHAP Values Shape:", shap_values.shape)
        print("Data Shape:", self.data.shape)

        # Handle binary classification by selecting SHAP values for one class if needed
        shap_values = self._select_class_values(shap_values)

        # Draw the plot the method name promises; previously the method
        # ended after selecting the values and never rendered anything.
        shap.summary_plot(
            shap_values,
            self.data,
            feature_names=self.feature_names,
            plot_type="bar",
        )



   

class JobSimilarityAnalyzer:
    """Embed job descriptions with a pretrained encoder and rank them by cosine similarity."""

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer and model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def get_embeddings(self, texts):
        """Get embeddings for a list of texts using BERT.

        Each text is tokenized and encoded on its own (truncated to 512
        tokens); the model's ``last_hidden_state`` is averaged over dim 1
        and flattened into one vector per text.

        Args:
            texts: Iterable of strings.

        Returns:
            ``np.ndarray`` holding one row per input text.
        """
        embeddings = []
        for text in texts:
            inputs = self.tokenizer(
                text, return_tensors="pt", truncation=True, max_length=512, padding=True
            )
            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Mean over dim 1 (presumably the token axis) gives a fixed-size vector.
            embeddings.append(outputs.last_hidden_state.mean(dim=1).numpy().flatten())
        return np.array(embeddings)

    def find_similar_jobs(self, df, query_index, num_similar=5):
        """Find similar job postings based on cosine similarity of embeddings.

        Every description in ``df`` is embedded and compared with the row at
        position ``query_index``. The query row itself takes part in the
        ranking, so it is normally returned as the top match.

        Args:
            df: DataFrame with a 'description' and a 'fraudulent' column.
            query_index: Positional index of the posting to compare against.
            num_similar: Number of rows to return; must be non-negative.

        Returns:
            DataFrame with columns 'Job Index', 'Similarity Score' and
            'Fraudulent', ordered from most to least similar.

        Raises:
            KeyError: If ``df`` has no 'description' column.
            ValueError: If ``num_similar`` is negative.
        """
        if 'description' not in df.columns:
            raise KeyError("The 'description' column is missing from the DataFrame. Please provide a DataFrame with job descriptions.")

        if num_similar < 0:
            raise ValueError("num_similar must be non-negative")
        if num_similar == 0:
            # ``ranked[-0:]`` is the *whole* array, so zero must be handled
            # explicitly; returning early also skips the costly embedding pass.
            return pd.DataFrame({
                'Job Index': np.array([], dtype=np.intp),
                'Similarity Score': np.array([], dtype=float),
                'Fraudulent': df['fraudulent'].iloc[:0]
            })

        descriptions = df['description'].fillna('')
        embeddings = self.get_embeddings(descriptions)
        similarities = cosine_similarity([embeddings[query_index]], embeddings)[0]
        # argsort is ascending: take the tail, then reverse for best-first order.
        similar_indices = np.argsort(similarities)[-num_similar:][::-1]

        return pd.DataFrame({
            'Job Index': similar_indices,
            'Similarity Score': similarities[similar_indices],
            'Fraudulent': df.iloc[similar_indices]['fraudulent']
        })

def analyze_fraud_patterns(df, shap_values, feature_names):
    """Summarise what characterises fraudulent postings.

    Builds a feature-importance table from the mean absolute SHAP value per
    feature (for a non-ndarray input only its first element is used), and
    prints, for the rows where ``fraudulent == 1``, the ten most frequent
    whitespace-separated lowercase words of each available text column and
    the leading normalised value counts of each available metadata column.

    Args:
        df: DataFrame of postings; 'fraudulent' is read whenever one of the
            inspected columns is present.
        shap_values: ndarray of SHAP values, or an indexable whose first
            element is such an array.
        feature_names: Labels matching the SHAP value columns.

    Returns:
        DataFrame with 'Feature' and 'Importance', sorted descending.
    """
    def fraud_only(column):
        # Evaluated lazily so 'fraudulent' is touched only when needed.
        return df[df['fraudulent'] == 1][column]

    if isinstance(shap_values, np.ndarray):
        magnitudes = np.abs(shap_values)
    else:
        magnitudes = np.abs(shap_values[0])
    ranking = pd.DataFrame({
        'Feature': feature_names,
        'Importance': magnitudes.mean(axis=0),
    })
    ranking = ranking.sort_values('Importance', ascending=False)

    for column in ('title', 'description', 'requirements'):
        if column not in df.columns:
            continue
        print(f"\nCommon words in fraudulent {column}:")
        joined = ' '.join(fraud_only(column).fillna(''))
        tokens = joined.lower().split()
        counts = pd.Series(tokens).value_counts()
        print(counts.head(10))

    for column in ('location', 'employment_type', 'salary_range'):
        if column not in df.columns:
            continue
        print(f"\nDistribution of {column} in fraudulent posts:")
        shares = fraud_only(column).value_counts(normalize=True)
        print(shares.head())

    return ranking

def plot_fraud_clusters(df, embeddings):
    """Project embeddings with UMAP and show them coloured by the fraud label.

    Args:
        df: DataFrame whose 'fraudulent' column supplies the point colours.
        embeddings: Array-like passed to ``umap.UMAP.fit_transform``.
    """
    # Fixed seed so repeated runs use the same random state.
    projected = umap.UMAP(random_state=42).fit_transform(embeddings)

    plt.figure(figsize=(12, 8))
    x_coords = projected[:, 0]
    y_coords = projected[:, 1]
    points = plt.scatter(x_coords, y_coords, c=df['fraudulent'], cmap='RdYlBu')
    plt.colorbar(points)

    plt.title('Job Postings Clustering')
    plt.xlabel('UMAP dimension 1')
    plt.ylabel('UMAP dimension 2')
    plt.show()

# Main execution
# Script entry point: load the prepared data and the trained model, print the
# SHAP diagnostics for the test set, then list the postings most similar to
# one example posting.
if __name__ == "__main__":
    try:
        # Load processed data
        # NOTE(review): read_pickle / joblib.load unpickle arbitrary objects;
        # only point these at files from a trusted source.
        X_train = pd.read_pickle('data/X_train_balanced.pkl')
        X_test = pd.read_pickle('data/X_test.pkl')
        y_train = pd.read_pickle('data/y_train_balanced.pkl')
        y_test = pd.read_pickle('data/y_test.pkl')
        # Column labels are taken from the training frame and reused for X_test below.
        feature_names = X_train.columns.tolist()

        # Load original data if needed for similarity analysis
        original_data = pd.read_pickle('data/processed_data.pkl')  # Must contain a 'description' column

        # Load model
        # NOTE(review): the variable is called stacking_model but the file
        # loaded is 'xgboost_model.joblib' — confirm which model is intended.
        stacking_model = joblib.load('models/xgboost_model.joblib')

        # Initialize analyzers
        shap_analyzer = ShapAnalyzer(stacking_model, X_test, feature_names)
        similarity_analyzer = JobSimilarityAnalyzer()

        # Perform SHAP analysis
        shap_analyzer.plot_feature_importance()

        # Analyze similar jobs
        # Positional index of the posting used as the similarity query.
        example_idx = 0
        similar_jobs = similarity_analyzer.find_similar_jobs(original_data, example_idx)
        print("\nSimilar Jobs Analysis:")
        print(similar_jobs)

    # Only missing files and missing columns are reported here; any other
    # exception propagates.
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        print("Ensure all data and model files are in the correct directory.")
    except KeyError as e:
        print(f"Key error: {e}")
        print("Check if the correct columns are available in the DataFrame used.")


SHAP Values Type: <class 'numpy.ndarray'>
SHAP Values Shape: (5364, 120)
Data Shape: (5364, 120)


KeyboardInterrupt: 