In [4]:
import nltk
from langdetect import detect, LangDetectException
import os

def download_nltk_resources():
    """Download the NLTK resources used by this cell.

    Requests the ``punkt`` tokenizer data through ``nltk.download`` in quiet
    mode.

    Returns:
        bool: True when the download call reports success; False when it
        reports failure or raises (an error message is printed either way).
    """
    try:
        # nltk.download() reports most failures (e.g. no network) through a
        # False return value rather than an exception, so the result has to
        # be checked explicitly instead of assuming success.
        downloaded = nltk.download('punkt', quiet=True)
    except Exception as e:
        print(f"Error downloading NLTK resources: {e}")
        return False

    if not downloaded:
        print("Error downloading NLTK resources: download of 'punkt' failed")
        return False
    return True

def read_file(file_path):
    """
    Load a UTF-8 text file and return its contents without surrounding whitespace.

    Args:
        file_path (str): Location of the text file to load

    Returns:
        str: The file's text with leading/trailing whitespace stripped
        None: If the file is missing or cannot be read (a message is printed)
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            raw_text = handle.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as err:
        # Any other failure (permissions, decoding, ...) is reported, not raised.
        print(f"Error reading file: {err}")
    else:
        return raw_text.strip()
    return None

# Display names for the language codes this notebook reports by name; any
# code missing from the table is returned unchanged by detect_language().
_LANGUAGE_NAMES = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'pt': 'Portuguese',
    'nl': 'Dutch',
    'ru': 'Russian',
    'ar': 'Arabic',
    'ja': 'Japanese',
    'ko': 'Korean',
    'ro': 'Romanian',
    'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)',
}


def detect_language(text):
    """
    Detect the language of the given text using langdetect.

    Args:
        text (str): Text to analyze

    Returns:
        str: The language's display name when its code is in
            ``_LANGUAGE_NAMES``, otherwise the raw code from ``detect``
        None: If the text is empty/whitespace-only or detection fails
    """
    # Nothing to analyse: skip the detector instead of sending it blank text
    # only to have it fail.
    if not text or not text.strip():
        return None

    try:
        lang_code = detect(text)
    except LangDetectException as e:
        print(f"Error detecting language: {e}")
        return None

    return _LANGUAGE_NAMES.get(lang_code, lang_code)

def main():
    """Run the language-detection demo on ``sample_phrase.txt``.

    Stops silently if the NLTK resources cannot be prepared or the file
    cannot be read; otherwise prints the text with its detected language, or
    a notice that detection failed.
    """
    # NLTK resources must be available before anything else happens
    resources_ready = download_nltk_resources()
    if not resources_ready:
        return

    file_path = 'sample_phrase.txt'

    # Load the sample text; read_file reports its own errors
    text = read_file(file_path)
    if text is None:
        return

    detected_language = detect_language(text)
    if not detected_language:
        print("Could not detect the language of the text.")
        return

    print(f"\nInput text: {text}")
    print(f"Detected language: {detected_language}")

# Entry point: run the language-detection demo when this code executes as the
# main module.
if __name__ == "__main__":
    main()


Input text: I like cats. I also like dogs. What i do not like is when people sit.
Detected language: English


In [9]:
import re
from collections import Counter
import statistics
from typing import Dict, List, Tuple

class RobustStyleMetrics:
    """Compute simple stylometric statistics for a piece of text.

    The text is tokenized once at construction. Every statistic is then
    derived from ``self.words`` (lower-cased word tokens) and ``self.chars``
    (each character of the original text, whitespace included).
    """

    # Characters that may form a word: ASCII a-z plus the Romanian letters
    #   \u0103 (a-breve), \u00e2 (a-circumflex), \u00ee (i-circumflex),
    #   \u0219 / \u021b (s / t with comma below, the standard forms) and
    #   \u015f / \u0163 (s / t with cedilla, the legacy forms still common in
    #   older Romanian text).
    # Only lower-case forms are listed because the text is lower-cased before
    # matching. Written as escapes so the two look-alike s/t variants cannot
    # be confused or silently normalised by an editor.
    _WORD_PATTERN = re.compile(
        r'[a-z\u0103\u00e2\u00ee\u0219\u021b\u015f\u0163]+'
    )

    def __init__(self, text: str):
        """
        Initialize the stylometry analyzer with text.

        Args:
            text (str): The input text to analyze
        """
        self.text = text
        self.words = self._tokenize_words()
        self.chars = list(text)

    def _tokenize_words(self) -> List[str]:
        """
        Split the text into lower-cased word tokens.

        A token is a maximal run of the letters in ``_WORD_PATTERN``; digits,
        punctuation and whitespace act as separators. Both the comma-below
        and the legacy cedilla spellings of Romanian s/t are kept inside
        words.
        """
        return self._WORD_PATTERN.findall(self.text.lower())

    def word_length_stats(self) -> Dict:
        """Return the mean, median and sample standard deviation of word lengths.

        All three values are 0 when the text has no words; ``std_dev`` is
        also 0 for a single word, because a sample deviation needs two
        points.
        """
        word_lengths = [len(word) for word in self.words]
        if not word_lengths:
            return {"average": 0, "median": 0, "std_dev": 0}

        return {
            "average": statistics.mean(word_lengths),
            "median": statistics.median(word_lengths),
            "std_dev": statistics.stdev(word_lengths) if len(word_lengths) > 1 else 0
        }

    def word_frequency(self, top_n: int = 10) -> List[Tuple[str, int]]:
        """Return the ``top_n`` most frequent words as ``(word, count)`` pairs."""
        return Counter(self.words).most_common(top_n)

    def char_frequency(self) -> Dict[str, float]:
        """Return each non-whitespace character's share of the text, in percent.

        The denominator is the total character count *including* whitespace,
        so the returned percentages add up to less than 100 whenever the text
        contains whitespace. Upper and lower case are counted separately.
        """
        char_count = Counter(self.chars)
        total_chars = len(self.chars)
        # For empty text the Counter is empty, so the division never runs.
        return {char: (count/total_chars)*100
                for char, count in char_count.items()
                if not char.isspace()}

    def basic_stats(self) -> Dict:
        """Return character, word and sentence counts plus two averages.

        Sentences are the non-blank pieces obtained by splitting the text on
        runs of ``.``, ``!`` or ``?``.
        """
        sentences = re.split('[.!?]+', self.text)
        sentences = [s.strip() for s in sentences if s.strip()]

        return {
            "total_characters": len(self.chars),
            "total_words": len(self.words),
            "avg_word_length": sum(len(word) for word in self.words) / len(self.words) if self.words else 0,
            "sentences": len(sentences),
            "avg_words_per_sentence": len(self.words) / len(sentences) if sentences else 0
        }

    def generate_report(self) -> Dict:
        """Bundle every statistic into one dictionary keyed by section name."""
        return {
            "basic_statistics": self.basic_stats(),
            "word_length_statistics": self.word_length_stats(),
            "most_common_words": self.word_frequency(10),
            "character_frequencies": self.char_frequency()
        }

def print_stylometry_report(text: str):
    """
    Analyze ``text`` and print its stylometric report (labels in Romanian).

    The report has four sections: basic statistics, word-length statistics,
    the most common words, and the frequency of every character that makes up
    at least 1% of the text.

    Args:
        text (str): Text to analyze
    """
    report = RobustStyleMetrics(text).generate_report()
    banner = "=" * 50
    rule = "-" * 20

    print("\n" + banner)
    print("RAPORT DE ANALIZĂ STILOMETRICĂ")
    print(banner + "\n")

    # --- basic statistics: floats get two decimals, counts print as-is ---
    print("STATISTICI DE BAZĂ:")
    print(rule)
    stats_translation = {
        "total_characters": "Total caractere",
        "total_words": "Total cuvinte",
        "avg_word_length": "Lungime medie cuvânt",
        "sentences": "Număr propoziții",
        "avg_words_per_sentence": "Media cuvintelor per propoziție"
    }
    for key, value in report['basic_statistics'].items():
        label = stats_translation[key]
        if isinstance(value, float):
            print(f"{label}: {value:.2f}")
        else:
            print(f"{label}: {value}")

    # --- word-length statistics: always two decimals ---
    print("\nSTATISTICI LUNGIME CUVINTE:")
    print(rule)
    length_stats_translation = {
        "average": "Medie",
        "median": "Mediană",
        "std_dev": "Deviație standard"
    }
    for key, value in report['word_length_statistics'].items():
        label = length_stats_translation[key]
        print(f"{label}: {value:.2f}")

    # --- most common words ---
    print("\nCELE MAI FRECVENTE CUVINTE:")
    print(rule)
    for word, freq in report['most_common_words']:
        print(f"'{word}': {freq} ori")

    # --- character frequencies, alphabetical, rare characters hidden ---
    print("\nFRECVENȚA CARACTERELOR (%):")
    print(rule)
    char_frequencies = report['character_frequencies']
    for char in sorted(char_frequencies):
        freq = char_frequencies[char]
        if freq < 1.0:  # below the 1% display threshold
            continue
        print(f"'{char}': {freq:.2f}%")

# Example usage
# NOTE: relies on read_file() defined in the language-detection cell above,
# so that cell must have been executed first.
if __name__ == "__main__":
    text = read_file('sample_phrase.txt')
    # read_file() returns None (after printing its own error) when the file
    # is missing or unreadable; skip the report in that case instead of
    # crashing inside the analyzer, which calls str methods on the text.
    if text is not None:
        print_stylometry_report(text)


RAPORT DE ANALIZĂ STILOMETRICĂ

STATISTICI DE BAZĂ:
--------------------
Total caractere: 69
Total cuvinte: 16
Lungime medie cuvânt: 3.19
Număr propoziții: 3
Media cuvintelor per propoziție: 5.33

STATISTICI LUNGIME CUVINTE:
--------------------
Medie: 3.19
Mediană: 4.00
Deviație standard: 1.42

CELE MAI FRECVENTE CUVINTE:
--------------------
'i': 3 ori
'like': 3 ori
'cats': 1 ori
'also': 1 ori
'dogs': 1 ori
'what': 1 ori
'do': 1 ori
'not': 1 ori
'is': 1 ori
'when': 1 ori

FRECVENȚA CARACTERELOR (%):
--------------------
'.': 4.35%
'I': 2.90%
'W': 1.45%
'a': 4.35%
'c': 1.45%
'd': 2.90%
'e': 8.70%
'g': 1.45%
'h': 2.90%
'i': 8.70%
'k': 4.35%
'l': 7.25%
'n': 2.90%
'o': 7.25%
'p': 2.90%
's': 7.25%
't': 5.80%
'w': 1.45%


In [102]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import random
import logging

def download_required_resources():
    """Make sure the NLTK data needed for paraphrasing is installed.

    Looks up ``punkt``, ``averaged_perceptron_tagger`` and ``wordnet`` in the
    local NLTK data path and quietly downloads any that are missing. The
    ``nltk.downloader`` logger is disabled while this runs and put back to
    its previous state afterwards, even if a download raises.
    """
    # Where each resource lives inside the NLTK data directory.
    resource_paths = {
        'punkt': 'tokenizers/punkt',
        'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
        'wordnet': 'corpora/wordnet',
    }

    downloader_log = logging.getLogger('nltk.downloader')
    was_disabled = downloader_log.disabled
    downloader_log.disabled = True
    try:
        for resource, data_path in resource_paths.items():
            try:
                nltk.data.find(data_path)
            except LookupError:
                # Not installed locally yet: fetch it without console output.
                nltk.download(resource, quiet=True)
    finally:
        # Restore the caller's logging state rather than forcing it on, and
        # do so even when a download fails part-way through.
        downloader_log.disabled = was_disabled

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag onto the matching WordNet POS constant.

    Tags beginning with J, V, N and R map to ``wn.ADJ``, ``wn.VERB``,
    ``wn.NOUN`` and ``wn.ADV`` respectively; every other tag yields None.
    """
    # (tag prefix, name of the constant on ``wn``), checked in this order.
    # The constant is looked up by name only once a prefix matches, so ``wn``
    # is not touched for tags that map to None.
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            return getattr(wn, constant_name)
    return None

def get_alternatives(word, pos):
    """Collect single-word replacements for ``word`` from WordNet.

    Args:
        word (str): The word to look up, compared as-is against lemma names.
        pos: WordNet part-of-speech constant; a falsy value means "unknown".

    Returns:
        list[tuple[str, str]]: Unique ``(kind, replacement)`` pairs, where
        ``kind`` is ``'synonym'``, ``'hypernym'`` or ``'antonym'`` (antonyms
        are emitted negated, as ``"not <antonym>"``). Pairs appear in
        first-seen order: all synonyms, then hypernyms, then antonyms. The
        list is empty when ``pos`` is falsy or WordNet has no synsets for
        the word.
    """
    if not pos:
        return []

    synsets = wn.synsets(word, pos=pos)
    if not synsets:
        return []

    alternatives = []

    # Synonyms: other lemmas in the word's own synsets. Multi-word lemmas
    # (WordNet joins them with '_') are skipped throughout.
    for synset in synsets:
        for lemma in synset.lemmas():
            name = lemma.name()
            if name != word and '_' not in name:
                alternatives.append(('synonym', name))

    # Hypernyms: lemmas of each synset's direct hypernyms.
    for synset in synsets:
        for hypernym in synset.hypernyms():
            for lemma in hypernym.lemmas():
                name = lemma.name()
                if name != word and '_' not in name:
                    alternatives.append(('hypernym', name))

    # Antonyms, negated so the substituted phrase keeps its meaning.
    for synset in synsets:
        for lemma in synset.lemmas():
            for antonym in lemma.antonyms():
                name = antonym.name()
                if '_' not in name:
                    alternatives.append(('antonym', f"not {name}"))

    # De-duplicate while keeping first-seen order. A plain set() would give a
    # hash-order-dependent list, so random.choice() over it could not be
    # reproduced across runs even with a fixed random seed.
    return list(dict.fromkeys(alternatives))

def generate_variations(phrase, variation_percentage=0.4, num_variations=5):
    """Generate paraphrases of ``phrase`` by swapping in WordNet alternatives.

    For each attempt a random subset of token positions is chosen, and every
    chosen token that has at least one alternative is replaced by a randomly
    picked one. Attempts that change nothing are dropped, so fewer than
    ``num_variations`` results may come back. Randomness comes from the
    module-level ``random`` generator, so call ``random.seed`` beforehand for
    repeatable output.

    Args:
        phrase (str): Text to paraphrase.
        variation_percentage (float): Fraction of tokens to try to replace in
            each attempt. At least one token and at most all tokens are
            selected.
        num_variations (int): Number of attempts to make (default 5).

    Returns:
        list[str]: The changed variations, with tokens joined by single
        spaces. Empty when the phrase has no tokens.
    """
    # Download required resources silently
    download_required_resources()

    tokens = word_tokenize(phrase)
    if not tokens:
        # random.sample() below cannot draw from an empty range.
        return []
    tagged = pos_tag(tokens)

    # Clamp to [1, len(tokens)]: random.sample() raises ValueError when asked
    # for more positions than exist (variation_percentage > 1).
    num_replacements = min(
        len(tokens),
        max(1, int(len(tokens) * variation_percentage)),
    )

    variations = []
    for _ in range(num_variations):
        new_tokens = list(tokens)

        for pos in random.sample(range(len(tokens)), num_replacements):
            word = tokens[pos]
            wordnet_pos = get_wordnet_pos(tagged[pos][1])

            alternatives = get_alternatives(word.lower(), wordnet_pos)
            if not alternatives:
                continue

            _kind, replacement = random.choice(alternatives)
            # Preserve original capitalization
            if word[0].isupper():
                replacement = replacement.capitalize()
            new_tokens[pos] = replacement

        # Compare token lists, not the joined string with the raw phrase:
        # re-joining with spaces alters punctuation spacing ("a, b" becomes
        # "a , b"), which made untouched phrases look like new variations.
        if new_tokens != tokens:
            variations.append(' '.join(new_tokens))

    return variations

def main():
    """Print the generated variations for three built-in example phrases."""
    # Example usage
    test_phrases = (
        "The old house stands far from the busy road",
        "The quick brown fox jumps over the lazy dog",
        "She walks slowly through the beautiful garden",
    )

    for phrase in test_phrases:
        print(f"\nOriginal phrase: {phrase}")
        print("Variations:")
        # Number the variations starting from 1 for display
        index = 0
        for variation in generate_variations(phrase):
            index += 1
            print(f"{index}. {variation}")

# Entry point: run the phrase-variation demo when this code executes as the
# main module.
if __name__ == "__main__":
    main()


Original phrase: The old house stands far from the busy road
Variations:
1. The quondam house rest far from the officious road
2. The old house stands far from the fussy way
3. The old house stands far from the busybodied route
4. The older house stands far from the busy means
5. The old community stands far from the busy route

Original phrase: The quick brown fox jumps over the lazy dog
Variations:
1. The quick brown trickster jump-start over the lazy cuss
2. The speedy brown fox jumps over the slothful dog
3. The speedy brown fox skip over the lazy fellow
4. The quick brownness fox jumps over the lazy hotdog
5. The quick brown canine jumps over the faineant canine

Original phrase: She walks slowly through the beautiful garden
Variations:
1. She walks slowly through the beautiful vegetation
2. She walks slowly through the not ugly garden
3. She score slowly through the beautiful yard
4. She walks not quickly through the beautiful garden
5. She walks tardily through the beautiful ga

In [76]:
import yake  # for keyword extraction
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# (tokenizer, model) pairs that have already been loaded, keyed by model name,
# so repeated calls do not reload the weights each time.
_CAUSAL_LM_CACHE = {}


def _load_causal_lm(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it once."""
    if model_name not in _CAUSAL_LM_CACHE:
        _CAUSAL_LM_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForCausalLM.from_pretrained(model_name),
        )
    return _CAUSAL_LM_CACHE[model_name]


def extract_and_generate_phrases(text, num_keywords=5,
                                 model_name="bigscience/bloomz-560m",
                                 max_new_tokens=100, do_sample=False):
    """Extract keywords from ``text`` and have a local LLM explain each one.

    Keywords are single words chosen by YAKE. For every keyword the model is
    prompted with the full text plus a request to explain the word in simple
    terms, and only the newly generated continuation is kept.

    Args:
        text (str): Source text; used both for keyword extraction and as the
            prompt context.
        num_keywords (int): Maximum number of keywords to extract.
        model_name (str): Hugging Face model id of the causal LM to use.
        max_new_tokens (int): Upper bound on generated tokens per keyword,
            not counting the prompt.
        do_sample (bool): When True, sample with temperature 0.7 / top-p 0.9;
            when False (default), decode deterministically.

    Returns:
        list[dict]: One dict per keyword with the keys ``'keyword'``,
        ``'original_context'`` and ``'generated_phrase'``.
    """
    # Initialize keyword extractor
    kw_extractor = yake.KeywordExtractor(
        lan="en",
        n=1,  # unigrams
        dedupLim=0.9,
        top=num_keywords,
        features=None
    )
    keywords = kw_extractor.extract_keywords(text)
    if not keywords:
        # Nothing to explain: avoid loading the model at all.
        return []

    tokenizer, model = _load_causal_lm(model_name)

    generation_kwargs = {
        # Bound only the continuation. A total-length cap (max_length) also
        # counts the prompt, which may be up to 512 tokens after truncation
        # and would then leave no room for any output.
        "max_new_tokens": max_new_tokens,
        "num_return_sequences": 1,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # temperature / top_p only take effect when sampling is enabled, so
        # they are passed only in that mode.
        generation_kwargs.update(do_sample=True, temperature=0.7, top_p=0.9)

    results = []
    for keyword, _score in keywords:
        prompt = f"Context: {text}\n\nTo explain the word {keyword} in simple terms:"

        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                # Passed explicitly so padding is never inferred from
                # pad_token_id, which is set to the EOS id above.
                attention_mask=inputs["attention_mask"],
                **generation_kwargs
            )

        # Decode only the tokens after the prompt. Splitting the decoded text
        # on the prompt's closing phrase breaks when that phrase also occurs
        # in `text` or was cut off by truncation.
        continuation = outputs[0][prompt_length:]
        generated_phrase = tokenizer.decode(continuation, skip_special_tokens=True).strip()

        results.append({
            'keyword': keyword,
            'original_context': text,
            'generated_phrase': generated_phrase
        })

    return results

# Example usage
# Demo: run keyword extraction and explanation generation on a short sample
# paragraph, then print each keyword with its generated phrase.
if __name__ == "__main__":
    # The triple-quoted literal keeps its leading newline and indentation;
    # it is passed to the function exactly as written.
    sample_text = """
    Artificial intelligence is revolutionizing healthcare through improved diagnosis 
    and treatment planning. Machine learning algorithms can analyze medical images 
    and patient data to detect patterns that humans might miss.
    """
    
    results = extract_and_generate_phrases(sample_text)
    
    # Print results
    for result in results:
        print(f"\nKeyword: {result['keyword']}")
        print(f"Generated Phrase: {result['generated_phrase']}")





Keyword: Artificial
Generated Phrase: Artificial intelligence is the ability to learn from the data and make intelligent decisions.

Keyword: planning
Generated Phrase: The process of planning a medical procedure.

Keyword: intelligence
Generated Phrase: Intelligence is the ability to reason, understand, and act on information.

Keyword: revolutionizing
Generated Phrase: AI revolutionizes healthcare through improved diagnosis and treatment planning. Machine learning algorithms can analyze medical images and patient data to detect patterns that humans might miss.

Keyword: healthcare
Generated Phrase: healthcare is the science and technology of medicine and health care.
