In [1]:
# Recommender engine (baseline: TF-IDF + random forest) - 4:40 pm

import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK corpora used by the recommender (stopword list and WordNet
# for lemmatization).
nltk.download('stopwords')
nltk.download('wordnet')

# Load the spaCy pipeline that preprocess_text uses for tokenization.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Catch only that case so that Ctrl-C and unrelated errors are not
    # swallowed and turned into an unexpected download attempt.
    import sys
    import subprocess
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

class XfinityPlanRecommender:
    """Baseline plan recommender: TF-IDF features fed to a random forest.

    Typical flow: construct -> prepare_data() -> train() -> predict() /
    explain_recommendation().

    NOTE(review): the training text is the preprocessed prompt PLUS tokens
    derived from each record's ``requirements`` field. If those fields
    determine ``plan`` in the dataset, the held-out accuracy printed by
    train() will overstate how well the model does on a bare prompt.
    Confirm against the data before trusting the score.
    """

    # (keywords, human-readable factor) pairs used by explain_recommendation.
    # Matching is plain substring matching on the lower-cased prompt.
    _FACTOR_KEYWORDS = (
        (('family', 'household', 'people', 'kids'), "household size"),
        (('device', 'devices', 'connected'), "number of connected devices"),
        (('stream', 'gaming', 'video', 'work', 'business'), "internet usage patterns"),
        (('budget', 'affordable', 'cheap', 'expensive', 'price'), "budget considerations"),
        (('data', 'cap', 'unlimited'), "data usage requirements"),
    )

    def __init__(self, data_path, plans_path=None):
        """
        Initialize the recommender system.

        Args:
            data_path: Path to the training data JSON
            plans_path: Optional path to Xfinity plans data
        """
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english'))

        # Load training data (JSON is UTF-8; do not depend on the OS default).
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        # Convert to DataFrame for easier handling
        self.df = pd.DataFrame(self.data)

        # Always define plans_data so later code can test it without hasattr().
        self.plans_data = None
        if plans_path:
            with open(plans_path, 'r', encoding='utf-8') as f:
                self.plans_data = json.load(f)

        # Set by train(); None means "not trained yet".
        self.model = None

    def preprocess_text(self, text):
        """
        Normalize free text for the vectorizer.

        Steps: lower-case, replace every non-word/non-space character with a
        space, tokenize with spaCy, drop stopwords / punctuation / whitespace
        tokens, lemmatize each remaining token with WordNet.

        Args:
            text: Raw input string

        Returns:
            Space-separated string of lemmas
        """
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)

        doc = nlp(text)

        kept = []
        for token in doc:
            # The substitution above leaves runs of spaces, which spaCy emits
            # as whitespace tokens; skip them so they do not end up in output.
            if token.is_space or token.is_punct:
                continue
            if token.text in self.stopwords:
                continue
            kept.append(self.lemmatizer.lemmatize(token.text))

        return ' '.join(kept)

    def extract_requirements_features(self, row):
        """
        Turn a record's ``requirements`` mapping into space-separated tokens.

        Args:
            row: Mapping (e.g. a DataFrame row) with a 'requirements' entry
                holding 'household_size', 'device_count', 'use_cases',
                'budget_level' and 'high_data_usage'

        Returns:
            String such as "household_4 devices_8 usecase_gaming budget_economy high_data_True"
        """
        req = row['requirements']

        features = [
            f"household_{req['household_size']}",
            f"devices_{req['device_count']}",
        ]
        # Spaces inside a use case would split it into several tokens.
        features.extend(f"usecase_{use_case.replace(' ', '_')}" for use_case in req['use_cases'])
        features.append(f"budget_{req['budget_level']}")
        features.append(f"high_data_{req['high_data_usage']}")

        return ' '.join(features)

    def prepare_data(self):
        """Build the combined text feature column and create the train/test split."""
        # Preprocess prompts
        self.df['processed_prompt'] = self.df['prompt'].apply(self.preprocess_text)

        # Tokens derived from the structured requirements
        self.df['requirement_features'] = self.df.apply(self.extract_requirements_features, axis=1)

        # Combine features (text + extracted features)
        self.df['combined_features'] = self.df['processed_prompt'] + ' ' + self.df['requirement_features']

        X = self.df['combined_features']
        y = self.df['plan']

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    def train(self):
        """
        Fit the TF-IDF + random-forest pipeline and print held-out metrics.

        Requires prepare_data() to have been called first.

        Returns:
            Accuracy on the held-out split
        """
        self.model = Pipeline([
            ('vectorizer', TfidfVectorizer(
                analyzer='word',
                max_features=1000,
                ngram_range=(1, 2)  # Include bigrams
            )),
            ('classifier', RandomForestClassifier(
                n_estimators=100,
                random_state=42
            ))
        ])

        self.model.fit(self.X_train, self.y_train)

        # Evaluate on test set
        y_pred = self.model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_pred)
        report = classification_report(self.y_test, y_pred)

        print(f"Model Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(report)

        return accuracy

    def predict(self, user_prompt, requirements=None):
        """
        Predict the best Xfinity plan based on a user prompt

        Args:
            user_prompt: String describing user's internet needs
            requirements: Optional mapping with the same keys as the training
                records' 'requirements' field. When given, its tokens are
                appended exactly as prepare_data() does, so the model sees the
                same kind of input it was trained on.

        Returns:
            Recommended plan name

        Raises:
            ValueError: If the model has not been trained
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Call train() first.")

        model_input = self.preprocess_text(user_prompt)
        if requirements is not None:
            requirement_tokens = self.extract_requirements_features({'requirements': requirements})
            model_input = model_input + ' ' + requirement_tokens

        return self.model.predict([model_input])[0]

    def explain_recommendation(self, user_prompt, requirements=None):
        """
        Provide a plan recommendation with explanation

        Args:
            user_prompt: String describing user's internet needs
            requirements: Optional structured requirements forwarded to predict()

        Returns:
            Dictionary with keys 'recommended_plan', 'explanation' and
            'plan_details' (None when no plans data was loaded or no entry
            under plans_data['internet'] has a matching 'name')
        """
        plan_name = self.predict(user_prompt, requirements=requirements)

        # Find the plan details from the plans data
        plan_details = None
        plans_data = getattr(self, 'plans_data', None)
        if plans_data:
            plan_details = next(
                (plan for plan in plans_data['internet'] if plan['name'] == plan_name),
                None,
            )

        # Keyword scan for the factors mentioned in the prompt.
        prompt_lower = user_prompt.lower()
        factors = [
            label
            for keywords, label in self._FACTOR_KEYWORDS
            if any(word in prompt_lower for word in keywords)
        ]

        # Without any recognized factor the original template produced
        # "Based on your , the ..."; fall back to a plain sentence instead.
        if factors:
            explanation = f"Based on your {' and '.join(factors)}, the {plan_name} plan would be best for your needs."
        else:
            explanation = f"The {plan_name} plan would be best for your needs."

        return {
            "recommended_plan": plan_name,
            "explanation": explanation,
            "plan_details": plan_details
        }

# Example usage
if __name__ == "__main__":
    import os

    # Input locations default to the original author's files but can be
    # overridden with environment variables, so the cell runs on other machines.
    training_data = os.environ.get(
        'XFINITY_TRAINING_DATA',
        '/Users/narengarapati/Desktop/xfinity_training_data_2.json'
    )
    target = os.environ.get(
        'XFINITY_PLANS_DATA',
        '/Users/narengarapati/Desktop/Xfinity_data.json'
    )

    # Create recommender instance
    recommender = XfinityPlanRecommender(training_data, target)

    # Prepare data and train model
    recommender.prepare_data()
    recommender.train()

    # Test with a new prompt
    test_prompt = "We're a family of 4 with 8 devices. We stream HD movies, play online games, and I work from home. We need reliable internet but don't want to break the bank."
    recommendation = recommender.explain_recommendation(test_prompt)

    print("\nTest Recommendation:")
    print(f"Prompt: {test_prompt}")
    print(f"Recommended Plan: {recommendation['recommended_plan']}")
    print(f"Explanation: {recommendation['explanation']}")
    # plan_details is None when the predicted plan is not found in the plans file.
    if recommendation['plan_details']:
        print(f"Speed: {recommendation['plan_details']['download_speed']}")
        print(f"Price: {recommendation['plan_details']['price']}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/narengarapati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/narengarapati/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Model Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

     Connect       1.00      1.00      1.00       135
Connect More       1.00      1.00      1.00       323
        Fast       1.00      1.00      1.00        98
 Gigabit Pro       1.00      1.00      1.00       565
  Gigabit x2       1.00      1.00      1.00        70
   Superfast       1.00      1.00      1.00        93

    accuracy                           1.00      1284
   macro avg       1.00      1.00      1.00      1284
weighted avg       1.00      1.00      1.00      1284


Test Recommendation:
Prompt: We're a family of 4 with 8 devices. We stream HD movies, play online games, and I work from home. We need reliable internet but don't want to break the bank.
Recommended Plan: Gigabit Pro
Explanation: Based on your household size and number of connected devices and internet usage patterns, the Gigabit Pro plan would be best for your needs.
Speed: 6000 Mbps
Price: $299.95


In [2]:
# Improved version: fine-tunes BERT on the prompt combined with the structured requirements

import pandas as pd
import numpy as np
import json
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

class XfinityPlanRecommenderAdvanced:
    """Plan recommender that fine-tunes ``bert-base-uncased`` as a classifier.

    Typical flow: construct -> prepare_data() -> train() -> predict(), with
    save_model() / load_model() to persist and restore the fine-tuned weights,
    tokenizer and label mapping.

    The model is trained on the prompt followed by a fixed sentence template
    built from the record's ``requirements``. predict() uses the same
    template (via _compose_text) when requirements are supplied.
    """

    # Tokenizer settings shared by training, validation and inference.
    _MAX_LENGTH = 128
    _BATCH_SIZE = 8

    def __init__(self, data_path, plans_path=None):
        """
        Initialize the advanced recommender system using BERT.

        Args:
            data_path: Path to the training data JSON
            plans_path: Optional path to Xfinity plans data
        """
        # Load training data (JSON is UTF-8; do not depend on the OS default).
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        # Convert to DataFrame for easier handling
        self.df = pd.DataFrame(self.data)

        # Always define plans_data so the attribute exists even without a path.
        self.plans_data = None
        if plans_path:
            with open(plans_path, 'r', encoding='utf-8') as f:
                self.plans_data = json.load(f)

        # Set up BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Use the GPU when one is available
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Set by train() or load_model(); None means "no model yet".
        self.model = None

    @staticmethod
    def _compose_text(prompt, requirements):
        """Append the structured requirements to the prompt using the training template."""
        return (
            f"{prompt} Household size: {requirements['household_size']}. "
            f"Devices: {requirements['device_count']}. "
            f"Uses: {', '.join(requirements['use_cases'])}. "
            f"Budget: {requirements['budget_level']}. "
            f"High data usage: {requirements['high_data_usage']}."
        )

    def _encode(self, texts):
        """Tokenize a string or list of strings with the settings used everywhere in this class."""
        return self.tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=self._MAX_LENGTH,
            return_tensors='pt'
        )

    def prepare_data(self):
        """Build label mappings, the combined text column, and the train/validation dataloaders."""
        # Map each plan name to an integer id and back.
        unique_plans = self.df['plan'].unique()
        self.label_dict = {plan: i for i, plan in enumerate(unique_plans)}
        self.reverse_label_dict = {i: plan for plan, i in self.label_dict.items()}

        # Convert plan names to numeric labels
        self.df['label'] = self.df['plan'].map(self.label_dict)

        # Combine prompt with structured data for more context
        self.df['combined_text'] = self.df.apply(
            lambda row: self._compose_text(row['prompt'], row['requirements']),
            axis=1
        )

        # Create train/validation split
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            self.df['combined_text'].values,
            self.df['label'].values,
            test_size=0.2,
            random_state=42
        )

        # Tokenize and encode sequences
        self.train_encodings = self._encode(train_texts.tolist())
        self.val_encodings = self._encode(val_texts.tolist())

        # Convert to PyTorch datasets
        self.train_dataset = TensorDataset(
            self.train_encodings['input_ids'],
            self.train_encodings['attention_mask'],
            torch.tensor(train_labels)
        )

        self.val_dataset = TensorDataset(
            self.val_encodings['input_ids'],
            self.val_encodings['attention_mask'],
            torch.tensor(val_labels)
        )

        # Training batches are shuffled; validation batches keep dataset order.
        self.train_dataloader = DataLoader(
            self.train_dataset,
            sampler=RandomSampler(self.train_dataset),
            batch_size=self._BATCH_SIZE
        )

        self.val_dataloader = DataLoader(
            self.val_dataset,
            sampler=SequentialSampler(self.val_dataset),
            batch_size=self._BATCH_SIZE
        )

        print(f"Prepared {len(self.train_dataset)} training samples and {len(self.val_dataset)} validation samples")
        print(f"Label mapping: {self.label_dict}")

    def _evaluate(self):
        """Run the model over the validation dataloader and print accuracy plus a per-class report."""
        self.model.eval()

        val_preds = []
        val_true = []

        for batch in self.val_dataloader:
            input_ids, attention_mask, labels = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )

            # .tolist() yields plain Python ints, which are safe dict keys below.
            val_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            val_true.extend(labels.cpu().tolist())

        accuracy = accuracy_score(val_true, val_preds)
        print(f"Validation Accuracy: {accuracy:.4f}")

        # Convert numeric predictions back to plan names for the report
        val_preds_names = [self.reverse_label_dict[p] for p in val_preds]
        val_true_names = [self.reverse_label_dict[t] for t in val_true]

        report = classification_report(val_true_names, val_preds_names)
        print("\nClassification Report:")
        print(report)

    def train(self, epochs=4, seed=42):
        """
        Fine-tune BERT for plan classification, validating after every epoch.

        Requires prepare_data() to have been called first.

        Args:
            epochs: Number of passes over the training dataloader
            seed: Value passed to torch.manual_seed before the model is
                created, intended to make the new classifier head and the
                batch order repeatable between runs. This sets torch's global
                seed. Pass None to leave the RNG untouched. Bit-exact
                repeatability on GPU is not guaranteed.
        """
        if seed is not None:
            torch.manual_seed(seed)

        # Initialize the BERT model for sequence classification
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(self.label_dict),
            output_attentions=False,
            output_hidden_states=False
        )

        self.model.to(self.device)

        # torch.optim.AdamW replaces the deprecated transformers.AdamW.
        # weight_decay=0.0 keeps the old default (torch's default is 0.01).
        optimizer = torch.optim.AdamW(
            self.model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0
        )

        # Linear decay of the learning rate to zero over all optimizer steps.
        total_steps = len(self.train_dataloader) * epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        for epoch in range(epochs):
            print(f"Starting epoch {epoch+1}/{epochs}")

            # Set model to training mode (_evaluate switches it to eval mode)
            self.model.train()

            total_loss = 0

            for batch in self.train_dataloader:
                input_ids, attention_mask, labels = tuple(t.to(self.device) for t in batch)

                # Clear gradients left over from the previous step
                optimizer.zero_grad()

                # Passing labels makes the model return the classification loss
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_loss += loss.item()

                loss.backward()

                optimizer.step()
                scheduler.step()

            avg_train_loss = total_loss / len(self.train_dataloader)
            print(f"Average training loss: {avg_train_loss}")

            self._evaluate()

        print("Training complete!")

    def predict(self, user_prompt, include_requirements=False, **kwargs):
        """
        Predict the best Xfinity plan based on a user prompt

        Args:
            user_prompt: String describing user's internet needs
            include_requirements: Whether to include structured requirements in prediction
            **kwargs: Optional requirements parameters (household_size, device_count,
                use_cases, budget_level, high_data_usage); missing ones get defaults

        Returns:
            Recommended plan name

        Raises:
            ValueError: If no model has been trained or loaded
        """
        if self.model is None:
            raise ValueError("Model not trained yet. Call train() first.")

        if include_requirements and kwargs:
            requirements = {
                'household_size': kwargs.get('household_size', 1),
                'device_count': kwargs.get('device_count', 1),
                'use_cases': kwargs.get('use_cases', ['basic browsing']),
                'budget_level': kwargs.get('budget_level', 'economy'),
                'high_data_usage': kwargs.get('high_data_usage', False),
            }
            # Same template as the training text, so the input format matches.
            text = self._compose_text(user_prompt, requirements)
        else:
            # NOTE(review): the model was trained on prompt + requirements text;
            # a bare prompt is a different input format. Confirm accuracy here.
            text = user_prompt

        inputs = self._encode(text)

        # Move to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Plain int so the lookup does not depend on numpy scalar hashing.
        prediction = torch.argmax(outputs.logits, dim=1)[0].item()

        return self.reverse_label_dict[prediction]

    def save_model(self, path):
        """
        Save the model, tokenizer and label mapping under ``path``.

        Raises:
            ValueError: If there is no model to save
        """
        if self.model is None:
            raise ValueError("No model to save. Train the model first.")

        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

        # Save label mappings (plan name -> integer id)
        with open(f"{path}/label_dict.json", 'w') as f:
            json.dump(self.label_dict, f)

    def load_model(self, path):
        """Load the tokenizer, label mapping and model previously written by save_model()."""
        self.tokenizer = BertTokenizer.from_pretrained(path)

        with open(f"{path}/label_dict.json", 'r') as f:
            self.label_dict = json.load(f)

        # Invert plan -> id into id -> plan, forcing ids to int.
        self.reverse_label_dict = {int(i): plan for plan, i in self.label_dict.items()}

        self.model = BertForSequenceClassification.from_pretrained(path)
        self.model.to(self.device)

        print(f"Model loaded from {path}")

# Helper: heuristic extraction of structured requirements from a free-text prompt
def extract_features_from_prompt(prompt):
    """
    Heuristically extract structured requirements from a natural language prompt.

    Used when no structured requirements data is available. All keyword
    checks are case-insensitive substring matches, so short keywords can
    also match inside longer words.

    Args:
        prompt: Free-text description of the user's internet needs

    Returns:
        Dict with keys:
            'household_size': int. Taken from "family/household of N" or
                "N people/members/..." when present, 3 when a family or
                household is mentioned without a number, else 1
            'device_count': int. Taken from "N devices" / "N connected", else 1
            'use_cases': list of str, ['basic browsing'] when nothing matches
            'budget_level': 'economy', 'premium' or 'mid_range'
            'high_data_usage': bool
    """
    text = prompt.lower()

    def mentions(*needles):
        # True when any needle occurs anywhere in the lower-cased prompt.
        return any(needle in text for needle in needles)

    features = {
        'household_size': 1,
        'device_count': 1,
        'use_cases': ['basic browsing'],
        'budget_level': 'economy',
        'high_data_usage': False
    }

    # Household size. "family of 4" is the most explicit phrasing and was
    # previously missed (it fell through to the default of 3), so check it first.
    household_of_match = re.search(r'\b(?:family|household)\s+of\s+(\d+)', prompt, re.IGNORECASE)
    household_match = re.search(r'(\d+)\s+(people|person|family|families|household|households|member|members)', prompt, re.IGNORECASE)
    if household_of_match:
        features['household_size'] = int(household_of_match.group(1))
    elif household_match:
        features['household_size'] = int(household_match.group(1))
    elif mentions('family', 'families', 'household', 'households'):
        features['household_size'] = 3  # Default for family/household mentions

    # Device count
    device_match = re.search(r'(\d+)\s+(device|devices|connected)', prompt, re.IGNORECASE)
    if device_match:
        features['device_count'] = int(device_match.group(1))

    # Use cases, appended in a fixed order
    use_case_rules = (
        (('stream', 'streaming', 'netflix', 'hulu', 'disney'), 'streaming'),
        (('4k', 'hd', 'high definition', 'high-definition'), '4K streaming'),
        (('game', 'gaming', 'play'), 'gaming'),
        (('competitive', 'tournament', 'esports'), 'competitive gaming'),
        (('work', 'business', 'webinar', 'zoom', 'meeting'), 'work from home'),
    )
    use_cases = [label for keywords, label in use_case_rules if mentions(*keywords)]
    if use_cases:
        features['use_cases'] = use_cases

    # Budget level: economy keywords win over premium ones; no keyword means mid_range.
    if mentions('cheap', 'budget', 'affordable', 'save', 'tight'):
        features['budget_level'] = 'economy'
    elif mentions('premium', 'best', 'top', 'high end', 'high-end'):
        features['budget_level'] = 'premium'
    else:
        features['budget_level'] = 'mid_range'

    # Data usage
    if mentions('lot of data', 'high data', 'unlimited data', 'data cap'):
        features['high_data_usage'] = True

    return features

# Full example usage
if __name__ == "__main__":
    import os

    # Input locations default to the original author's files but can be
    # overridden with environment variables, so the cell runs on other machines.
    training_data = os.environ.get(
        'XFINITY_TRAINING_DATA',
        '/Users/narengarapati/Desktop/xfinity_training_data_2.json'
    )
    target = os.environ.get(
        'XFINITY_PLANS_DATA',
        '/Users/narengarapati/Desktop/Xfinity_data.json'
    )

    # Create advanced recommender instance
    recommender = XfinityPlanRecommenderAdvanced(training_data, target)

    # Prepare data and train model
    recommender.prepare_data()
    recommender.train(epochs=3)

    # Save the model
    recommender.save_model("xfinity_bert_model")

    # Load the model (would be used in production)
    # recommender.load_model("xfinity_bert_model")

    # Test with a new prompt
    test_prompt = "We're a family of 4 with 8 devices. We stream HD movies, play online games, and I work from home. We need reliable internet but don't want to break the bank."

    # Extract features from the prompt
    features = extract_features_from_prompt(test_prompt)
    print(f"Extracted features: {features}")

    # Predict with the extracted features appended to the prompt, matching
    # the prompt + requirements text the model was trained on.
    predicted_plan = recommender.predict(
        test_prompt,
        include_requirements=True,
        **features
    )

    print(f"\nRecommended Plan: {predicted_plan}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using device: cpu
Prepared 5134 training samples and 1284 validation samples
Label mapping: {'Gigabit x2': 0, 'Connect': 1, 'Connect More': 2, 'Gigabit Pro': 3, 'Fast': 4, 'Superfast': 5}


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting epoch 1/3
Average training loss: 0.2339244163689433
Validation Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

     Connect       1.00      1.00      1.00       135
Connect More       1.00      1.00      1.00       323
        Fast       1.00      1.00      1.00        98
 Gigabit Pro       1.00      1.00      1.00       565
  Gigabit x2       1.00      1.00      1.00        70
   Superfast       1.00      1.00      1.00        93

    accuracy                           1.00      1284
   macro avg       1.00      1.00      1.00      1284
weighted avg       1.00      1.00      1.00      1284

Starting epoch 2/3
Average training loss: 0.006219782250007829
Validation Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

     Connect       1.00      1.00      1.00       135
Connect More       1.00      1.00      1.00       323
        Fast       1.00      1.00      1.00        98
 Gigabit Pro