In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [15]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from torch.utils.data import Dataset
import logging
import pandas as pd
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class LicensePlateDataset(Dataset):
    def __init__(self, tokenizer, data_dir="./data", max_samples=100):
        self.tokenizer = tokenizer
        self.data_dir = data_dir
        self.max_samples = max_samples
        self.data = self._load_data()

    def _load_data(self):
        processed_data = []

        # Process each data file with controlled sampling
        for file_name, status in [
            ('accepted-plates.csv', 'ACCEPTED'),
            ('rejected-plates.csv', 'REJECTED'),
            ('red-guide.csv', 'GUIDE')
        ]:
            try:
                file_path = os.path.join(self.data_dir, file_name)
                if os.path.exists(file_path):
                    df = pd.read_csv(file_path)
                    # Use random sampling instead of head()
                    if len(df) > self.max_samples:
                        df = df.sample(n=self.max_samples, random_state=42)

                    for plate in df['plate'].dropna():
                        processed_data.append({
                            'plate': str(plate).upper().strip(),
                            'status': status
                        })
                    print(f"Successfully processed {len(df)} samples from {file_name}")
            except Exception as e:
                print(f"Error processing {file_name}: {str(e)}")
                continue

        return processed_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        # Create a simple, standardized input format
        input_text = f"License plate analysis: {item['plate']}"
        target_text = f" Classification: {item['status']}"

        # Tokenize input separately
        input_encoding = self.tokenizer(
            input_text,
            truncation=True,
            max_length=32,
            padding='max_length',
            return_tensors='pt'
        )

        # Tokenize full sequence for labels
        full_encoding = self.tokenizer(
            input_text + target_text,
            truncation=True,
            max_length=32,
            padding='max_length',
            return_tensors='pt'
        )

        # Create labels with -100 for input tokens
        labels = full_encoding['input_ids'].clone()
        labels[:, :input_encoding['input_ids'].shape[1]] = -100

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }

def train_model():
    print("Initializing optimized training process...")

    # Clear any existing cached memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Initialize model and tokenizer
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model with optimized settings
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use full precision initially
        low_cpu_mem_usage=True,
        use_cache=False  # Disable cache for gradient checkpointing
    )

    # Create dataset with controlled size
    dataset = LicensePlateDataset(tokenizer, max_samples=100)

    # Configure training arguments for stability
    training_args = TrainingArguments(
        output_dir="trained_model",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        logging_steps=10,
        save_steps=100,
        learning_rate=5e-5,
        warmup_steps=10,
        weight_decay=0.01,
        save_total_limit=1,
        report_to="none",
        evaluation_strategy="no",
        save_strategy="epoch",
        fp16=False,  # Disable FP16 training
        gradient_checkpointing=True,
        dataloader_num_workers=0,
        remove_unused_columns=False
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )

    print("Starting training...")
    trainer.train()

    print("Saving model...")
    trainer.save_model("trained_model")
    tokenizer.save_pretrained("trained_model")
    print("Training completed successfully!")

    return "trained_model"

def main():
    try:
        print("Checking GPU memory...")
        if torch.cuda.is_available():
            print(f"GPU Memory available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

        trained_model_path = train_model()
        print(f"Model saved to: {trained_model_path}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    finally:
        # Clean up memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

Checking GPU memory...
GPU Memory available: 42.48 GB
Initializing optimized training process...
Successfully processed 100 samples from accepted-plates.csv
Successfully processed 100 samples from rejected-plates.csv
Successfully processed 100 samples from red-guide.csv




Starting training...


Step,Training Loss
10,0.0


Saving model...
Training completed successfully!
Model saved to: trained_model


In [None]:
import os
import warnings
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging
from typing import Dict
import gc

warnings.filterwarnings('ignore', category=UserWarning)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SavedModelAnalyzer:
    def __init__(self, model_path="trained_model"):
        """Initialize analyzer with saved model."""
        self.model_path = model_path
        self.setup_model()

    def setup_model(self):
        """Load the saved TinyLlama model."""
        logger.info("Loading saved model and tokenizer...")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_path,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            logger.info("Model loaded successfully")

        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def free_memory(self):
        """Clean up memory."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def analyze_plate(self, plate: str) -> Dict[str, str]:
        """Generate plate analysis using saved model."""
        prompt = f"""<human>: Analyze this license plate: {plate}
Consider these aspects:
1. Is it personalized or random?
2. Is it appropriate/legal?
3. What does it mean?
4. What category (personal, hobby, sports, etc.)?
Provide a brief but complete analysis.

<assistant>: I'll analyze the license plate '{plate}'."""

        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).to(self.model.device)

            with torch.inference_mode():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    max_length=256,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                    num_beams=1,
                    early_stopping=True,
                    pad_token_id=self.tokenizer.pad_token_id
                )

            analysis = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            analysis = analysis.split("<assistant>:")[-1].strip()
            self.free_memory()

            return {
                'analysis': analysis,
                'plate': plate
            }

        except Exception as e:
            logger.error(f"Error analyzing plate {plate}: {str(e)}")
            return {'error': str(e), 'plate': plate}

def analyze_plates_interactive():
    """Interactive analysis function using saved model."""
    try:
        print("\nInitializing License Plate Analyzer...")
        print("Loading saved model (this may take a moment)...")
        analyzer = SavedModelAnalyzer()
        print("\nAnalyzer ready!")

        while True:
            print("\n" + "="*50)
            print("License Plate Analysis Tool")
            print("="*50)
            print("\nOptions:")
            print("1. Enter a license plate to analyze")
            print("2. Exit")

            choice = input("\nEnter your choice (1 or 2): ").strip()

            if choice == '2':
                print("\nExiting analyzer. Goodbye!")
                break

            elif choice == '1':
                plate = input("\nEnter license plate to analyze: ").strip().upper()

                if not plate:
                    print("Please enter a valid plate number.")
                    continue

                print("\nAnalyzing plate:", plate)
                print("-" * 50)

                result = analyzer.analyze_plate(plate)

                if 'error' in result:
                    print(f"Error analyzing plate: {result['error']}")
                else:
                    print("\nAnalysis Results:")
                    print(result['analysis'])

                input("\nPress Enter to continue...")

            else:
                print("\nInvalid choice. Please try again.")

            analyzer.free_memory()

    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")
        logger.error(f"Analysis error: {str(e)}")

    finally:
        print("\nCleaning up resources...")
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

if __name__ == "__main__":
    analyze_plates_interactive()


Initializing License Plate Analyzer...
Loading saved model (this may take a moment)...

Analyzer ready!

License Plate Analysis Tool

Options:
1. Enter a license plate to analyze
2. Exit

Enter your choice (1 or 2): 1

Enter license plate to analyze: vishnu

Analyzing plate: VISHNU
--------------------------------------------------

Analysis Results:
I'll analyze the license plate 'VISHNU'.

1. Is it personalized or random?

The license plate 'VISHNU' is personalized, meaning that the license plate number is the same as the driver's name. This is a common practice among Indian drivers to identify themselves by their name.

2. Is it appropriate/legal?

The license plate 'VISHNU' is legal, meaning that it follows the Indian Motor Vehicles Act of 1989. The license plate designates a vehicle as a Vishnu vehicle, which is a religious symbol in Hinduism.

3. What does it mean?

The license plate 'VISHNU' means that the vehicle is owned by Vishnu, a Hindu god.

4. What category
