In [1]:
%pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m71.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:03[0m
[?25hDownloading regex-2024.11.6-cp313-cp313-macosx_11_0_arm64.whl (284 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.8 nltk-3.9.1 regex-2024.11.6 tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
import re
import unicodedata
from typing import List, Dict, Union
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK data
nltk.download('punkt')
try:
    nltk.download('stopwords')
except:
    print("Note: Turkish stopwords might not be available in NLTK")

def clean_text(text: str) -> str:
    """
    Clean text by removing Unicode escape sequences and normalizing Unicode characters
    """
    if not isinstance(text, str):
        return ""
    
    # Decode Unicode escape sequences
    text = text.encode('utf-8').decode('unicode-escape')
    
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)
    
    # Remove special characters and extra whitespace
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip().lower()

def process_json_entry(entry: Dict[str, Union[str, int, float, List]]) -> str:
    """
    Extract and clean text fields from a JSON entry
    """
    # Fields to process - add more if needed
    text_fields = ['clean_caption', 'biography']
    
    # Combine all text fields
    combined_text = ' '.join(str(entry.get(field, '')) for field in text_fields)
    
    return clean_text(combined_text)

def tokenize_turkish_text(text: str) -> List[str]:
    """
    Tokenize Turkish text using NLTK
    """
    return word_tokenize(text)

def process_jsonl_file(file_path: str):
    """
    Process JSONL file and return cleaned, tokenized texts
    """
    processed_texts = []
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                try:
                    entry = json.loads(line.strip())
                    cleaned_text = process_json_entry(entry)
                    if cleaned_text:  # Only add non-empty texts
                        processed_texts.append(cleaned_text)
                except json.JSONDecodeError:
                    print(f"Skipping invalid JSON line: {line[:50]}...")
                    continue
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return []
    
    return processed_texts

def create_tfidf_matrix(texts: List[str]):
    """
    Create TF-IDF matrix from processed texts
    """
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize_turkish_text,
        lowercase=True,
        max_features=5000  # Adjust as needed
    )
    
    return vectorizer, vectorizer.fit_transform(texts)

# Replace with your file path
file_path = "processed_dataset-r3v9.jsonl"

# Process the file
processed_texts = process_jsonl_file(file_path)

if processed_texts:
    print(f"Processed {len(processed_texts)} entries")
    
    # Example of processing first few texts
    print("\nFirst few processed texts:")
    for text in processed_texts[:3]:
        print("\nOriginal:", text)
        print("Tokenized:", tokenize_turkish_text(text))
    
    # Create TF-IDF matrix
    vectorizer, tfidf_matrix = create_tfidf_matrix(processed_texts)
    print(f"\nTF-IDF matrix shape: {tfidf_matrix.shape}")
    
    # Example of getting feature names
    feature_names = vectorizer.get_feature_names_out()
    print(f"\nFirst 10 features: {feature_names[:10]}")

[nltk_data] Downloading package punkt to /Users/yektata/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yektata/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
  text = text.encode('utf-8').decode('unicode-escape')
  text = text.encode('utf-8').decode('unicode-escape')


Processed 94043 entries

First few processed texts:

Original: cumhuriyetimizin 100 yili kutlu olsunï ð 1ð


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/yektata/nltk_data'
    - '/Users/yektata/VSCodeProjects/412project/.venv/nltk_data'
    - '/Users/yektata/VSCodeProjects/412project/.venv/share/nltk_data'
    - '/Users/yektata/VSCodeProjects/412project/.venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [4]:
import json
import re
import unicodedata
from typing import List, Dict, Union
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download required NLTK data
nltk.download('punkt')
try:
    nltk.download('stopwords')
except:
    print("Note: Turkish stopwords might not be available in NLTK")

nltk.download('punkt_tab')

def clean_text(text: str) -> str:
    """
    Enhanced text cleaning function with better Unicode handling
    """
    if not isinstance(text, str):
        return ""
    
    # Replace common problematic Unicode characters
    text = text.replace('\\u2019', "'")  # Smart quote
    text = text.replace('\\u2018', "'")  # Smart quote
    text = text.replace('\\u201c', '"')  # Smart quote
    text = text.replace('\\u201d', '"')  # Smart quote
    
    # Handle emoji and other special characters
    text = re.sub(r'\\[uU][0-9a-fA-F]{4}', ' ', text)  # Remove Unicode escape sequences
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    
    # Normalize remaining Unicode
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ASCII', 'ignore').decode('ASCII')
    
    # Remove special characters but keep Turkish letters
    text = re.sub(r'[^a-zA-Z0-9ğüşıöçĞÜŞİÖÇ\s]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip().lower()

def process_json_entry(entry: Dict[str, Union[str, int, float, List]]) -> str:
    """
    Extract and clean text fields from a JSON entry
    """
    # Fields to process
    text_fields = ['clean_caption', 'biography']
    
    # Combine all text fields with proper handling of None values
    combined_text = ' '.join(str(entry.get(field, '')) for field in text_fields if entry.get(field))
    
    return clean_text(combined_text)

def tokenize_turkish_text(text: str) -> List[str]:
    """
    Tokenize Turkish text using NLTK
    """
    # Additional preprocessing for Turkish-specific cases
    text = text.replace('i̇', 'i')  # Handle dotted i issues
    return word_tokenize(text)

def process_jsonl_file(file_path: str):
    """
    Process JSONL file and return cleaned, tokenized texts
    """
    processed_texts = []
    error_count = 0
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_num, line in enumerate(file, 1):
                try:
                    entry = json.loads(line.strip())
                    cleaned_text = process_json_entry(entry)
                    if cleaned_text:  # Only add non-empty texts
                        processed_texts.append(cleaned_text)
                except json.JSONDecodeError:
                    error_count += 1
                    print(f"Error parsing JSON at line {line_num}")
                    continue
                except Exception as e:
                    error_count += 1
                    print(f"Error processing line {line_num}: {str(e)}")
                    continue
                
                # Print progress every 10000 lines
                if line_num % 10000 == 0:
                    print(f"Processed {line_num} lines...")
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return []
    
    print(f"Finished processing with {error_count} errors")
    return processed_texts

def create_tfidf_matrix(texts: List[str]):
    """
    Create TF-IDF matrix from processed texts
    """
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize_turkish_text,
        lowercase=True,
        max_features=5000,  # Adjust as needed
        min_df=5,  # Minimum document frequency
        max_df=0.95  # Maximum document frequency
    )
    
    return vectorizer, vectorizer.fit_transform(texts)

# Replace with your file path
file_path = "processed_dataset-r3v9.jsonl"

# Process the file
processed_texts = process_jsonl_file(file_path)

if processed_texts:
    print(f"\nProcessed {len(processed_texts)} entries")
    
    # Example of processing first few texts
    print("\nFirst few processed texts:")
    for text in processed_texts[:3]:
        print("\nOriginal:", text)
        print("Tokenized:", tokenize_turkish_text(text))
    
    # Create TF-IDF matrix
    vectorizer, tfidf_matrix = create_tfidf_matrix(processed_texts)
    print(f"\nTF-IDF matrix shape: {tfidf_matrix.shape}")
    
    # Example of getting feature names
    feature_names = vectorizer.get_feature_names_out()
    print(f"\nFirst 10 features: {feature_names[:10]}")

[nltk_data] Downloading package punkt to /Users/yektata/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yektata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yektata/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Processed 10000 lines...
Processed 20000 lines...
Processed 30000 lines...
Processed 40000 lines...
Processed 50000 lines...
Processed 60000 lines...
Processed 70000 lines...
Processed 80000 lines...
Processed 90000 lines...
Finished processing with 0 errors

Processed 93927 entries

First few processed texts:

Original: cumhuriyetimizin 100 yili kutlu olsun
Tokenized: ['cumhuriyetimizin', '100', 'yili', 'kutlu', 'olsun']

Original: oriflame duologi lansmani
Tokenized: ['oriflame', 'duologi', 'lansmani']

Original: 07agustos 23 oriflameturkiye
Tokenized: ['07agustos', '23', 'oriflameturkiye']





TF-IDF matrix shape: (93927, 5000)

First 10 features: ['0' '00' '000' '0090' '01' '012' '02' '0212' '0216' '0224']


In [6]:
print(f"\nFirst 10 features: {feature_names[:1000]}")


First 10 features: ['0' '00' '000' '0090' '01' '012' '02' '0212' '0216' '0224' '0232' '0242'
 '0252' '0262' '0264' '0266' '03' '0312' '0332' '035' '0352' '0380' '04'
 '0422' '05' '0505' '0506' '0507' '0530' '0531' '0532' '0533' '0534'
 '0535' '0536' '0537' '0538' '0539' '0541' '0542' '0543' '0544' '0545'
 '0546' '0549' '0552' '0553' '0554' '0555' '06' '07' '08' '0850' '09' '1'
 '10' '100' '1000' '100th' '101' '105' '11' '12' '120' '125' '13' '137'
 '14' '140' '15' '150' '1500' '153' '154' '16' '17' '170' '18' '180' '19'
 '1954' '1970' '1978' '1986' '1988' '1994' '1995' '2' '20' '200' '2000'
 '2001' '2002' '2005' '2012' '2014' '2015' '2018' '2019' '2020' '2021'
 '2022' '2023' '2024' '208' '21' '210' '212' '214' '216' '22' '220' '222'
 '223' '224' '23' '230' '232' '234' '236' '24' '241' '242' '243' '246'
 '25' '250' '252' '255' '256' '258' '259' '26' '262' '264' '265' '266'
 '27' '272' '28' '285' '286' '287' '29' '291' '3' '30' '300' '302' '304'
 '31' '311' '312' '317' '318' '32' '321' 

In [8]:
import json
import re
import unicodedata
from typing import List, Dict, Union
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Previous functions remain the same until create_tfidf_matrix
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    text = text.replace('\\u2019', "'")
    text = text.replace('\\u2018', "'")
    text = text.replace('\\u201c', '"')
    text = text.replace('\\u201d', '"')
    
    text = re.sub(r'\\[uU][0-9a-fA-F]{4}', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ASCII', 'ignore').decode('ASCII')
    
    text = re.sub(r'[^a-zA-Z0-9ğüşıöçĞÜŞİÖÇ\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip().lower()

def process_json_entry(entry: Dict[str, Union[str, int, float, List]]) -> str:
    text_fields = ['clean_caption', 'biography']
    combined_text = ' '.join(str(entry.get(field, '')) for field in text_fields if entry.get(field))
    return clean_text(combined_text)

def tokenize_turkish_text(text: str) -> List[str]:
    text = text.replace('i̇', 'i')
    return word_tokenize(text)

def process_jsonl_file(file_path: str) -> tuple:
    """
    Process JSONL file and return cleaned texts and labels
    """
    processed_texts = []
    labels = []
    error_count = 0
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_num, line in enumerate(file, 1):
                try:
                    entry = json.loads(line.strip())
                    cleaned_text = process_json_entry(entry)
                    if cleaned_text:
                        processed_texts.append(cleaned_text)
                        # Get the label (train_category)
                        labels.append(entry.get('train_category', 'unknown'))
                except json.JSONDecodeError:
                    error_count += 1
                    continue
                except Exception as e:
                    error_count += 1
                    continue
                
                if line_num % 10000 == 0:
                    print(f"Processed {line_num} lines...")
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        return [], []
    
    print(f"Finished processing with {error_count} errors")
    return processed_texts, labels

def create_tfidf_matrix(texts: List[str]):
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize_turkish_text,
        lowercase=True,
        max_features=5000,
        min_df=5,
        max_df=0.95
    )
    
    return vectorizer, vectorizer.fit_transform(texts)

def train_and_evaluate_random_forest(X, y, random_state=42):
    """
    Train and evaluate Random Forest model
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    
    # Initialize and train the model
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=random_state,
        n_jobs=-1
    )
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    train_preds = rf_model.predict(X_train)
    test_preds = rf_model.predict(X_test)
    
    # Calculate metrics
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy = accuracy_score(y_test, test_preds)
    
    # Calculate F1 scores (micro for multi-class)
    train_f1 = f1_score(y_train, train_preds, average='micro')
    test_f1 = f1_score(y_test, test_preds, average='micro')
    
    return {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_f1': train_f1,
        'test_f1': test_f1,
        'model': rf_model,
        'feature_importance': rf_model.feature_importances_
    }

# Replace with your file path
file_path = "processed_dataset-r3v10.jsonl"

# Process the file
print("Processing text data...")
processed_texts, labels = process_jsonl_file(file_path)

if processed_texts:
    print(f"\nTotal samples: {len(processed_texts)}")
    print(f"Number of classes: {len(set(labels))}")
    
    # Create TF-IDF matrix
    print("\nCreating TF-IDF matrix...")
    vectorizer, tfidf_matrix = create_tfidf_matrix(processed_texts)
    
    # Train and evaluate model
    print("\nTraining Random Forest model...")
    results = train_and_evaluate_random_forest(tfidf_matrix, labels)
    
    # Print results
    print("\nResults:")
    print(f"Training Accuracy: {results['train_accuracy']:.4f}")
    print(f"Validation Accuracy: {results['test_accuracy']:.4f}")
    print(f"Training F1 Score: {results['train_f1']:.4f}")
    print(f"Validation F1 Score: {results['test_f1']:.4f}")
    
    # Print top features
    feature_names = vectorizer.get_feature_names_out()
    top_features_idx = np.argsort(results['feature_importance'])[-10:]
    print("\nTop 10 most important features:")
    for idx in top_features_idx[::-1]:
        print(f"{feature_names[idx]}: {results['feature_importance'][idx]:.4f}")

Processing text data...
Processed 10000 lines...
Processed 20000 lines...
Processed 30000 lines...
Processed 40000 lines...
Processed 50000 lines...
Processed 60000 lines...
Processed 70000 lines...
Processed 80000 lines...
Processed 90000 lines...
Finished processing with 0 errors

Total samples: 93927
Number of classes: 10

Creating TF-IDF matrix...





Training Random Forest model...

Results:
Training Accuracy: 0.9914
Validation Accuracy: 0.8929
Training F1 Score: 0.9914
Validation F1 Score: 0.8929

Top 10 most important features:
lezzet: 0.0098
ve: 0.0071
teknoloji: 0.0065
com: 0.0059
rezervasyon: 0.0047
siparis: 0.0045
lezzetli: 0.0045
icin: 0.0043
belediye: 0.0040
baskani: 0.0038


In [9]:
%pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [10]:
import json
import re
import unicodedata
from typing import List, Dict, Union
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import pandas as pd

# [Previous functions remain the same until train_and_evaluate_random_forest]

def train_and_evaluate_random_forest(X, y, random_state=42):
    """
    Train and evaluate Random Forest model with confusion matrix
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    
    # Initialize and train the model
    rf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=random_state,
        n_jobs=-1
    )
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    train_preds = rf_model.predict(X_train)
    test_preds = rf_model.predict(X_test)
    
    # Calculate metrics
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy = accuracy_score(y_test, test_preds)
    train_f1 = f1_score(y_train, train_preds, average='micro')
    test_f1 = f1_score(y_test, test_preds, average='micro')
    
    # Create confusion matrix
    cm = confusion_matrix(y_test, test_preds)
    
    # Plot confusion matrix
    plt.figure(figsize=(12, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
    # Get unique labels for axis labels
    unique_labels = sorted(set(y))
    plt.xticks(np.arange(len(unique_labels)) + 0.5, unique_labels, rotation=45)
    plt.yticks(np.arange(len(unique_labels)) + 0.5, unique_labels, rotation=45)
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()
    
    return {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_f1': train_f1,
        'test_f1': test_f1,
        'model': rf_model,
        'feature_importance': rf_model.feature_importances_,
        'confusion_matrix': cm,
        'labels': unique_labels
    }

def predict_test_data(model, vectorizer, test_file_path: str):
    """
    Predict categories for test data and aggregate by username
    """
    username_predictions = {}
    username_texts = {}
    
    # Read and process test data
    with open(test_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                entry = json.loads(line.strip())
                username = entry.get('username')
                if not username:
                    continue
                
                # Process text
                text = process_json_entry(entry)
                if text:
                    if username not in username_texts:
                        username_texts[username] = []
                    username_texts[username].append(text)
            except json.JSONDecodeError:
                continue
    
    # Make predictions for each username
    for username, texts in username_texts.items():
        # Transform texts to TF-IDF
        text_features = vectorizer.transform(texts)
        
        # Get predictions for all posts
        predictions = model.predict(text_features)
        
        # Count predictions and handle ties
        pred_counter = Counter(predictions)
        max_count = max(pred_counter.values())
        
        # Get all labels with maximum count
        max_labels = [label for label, count in pred_counter.items() 
                     if count == max_count]
        
        if len(max_labels) == 1:
            # Clear winner
            final_prediction = max_labels[0]
        else:
            # Tie-breaking: use the label with highest confidence scores
            confidences = []
            for label in max_labels:
                # Get indices where this label was predicted
                label_indices = [i for i, pred in enumerate(predictions) if pred == label]
                # Get average confidence for this label
                label_confidences = model.predict_proba(text_features[label_indices])
                avg_confidence = np.mean([conf[list(model.classes_).index(label)] 
                                       for conf in label_confidences])
                confidences.append((label, avg_confidence))
            
            # Choose label with highest average confidence
            final_prediction = max(confidences, key=lambda x: x[1])[0]
        
        username_predictions[username] = final_prediction
    
    return username_predictions

# Training phase
train_file_path = "processed_dataset-r3v10.jsonl"
test_file_path = "processed_dataset-r3v10-test.jsonl"

# Process training data
print("Processing training data...")
processed_texts, labels = process_jsonl_file(train_file_path)

if processed_texts:
    print(f"\nTotal training samples: {len(processed_texts)}")
    print(f"Number of classes: {len(set(labels))}")
    
    # Create TF-IDF matrix
    print("\nCreating TF-IDF matrix...")
    vectorizer, tfidf_matrix = create_tfidf_matrix(processed_texts)
    
    # Train and evaluate model
    print("\nTraining Random Forest model...")
    results = train_and_evaluate_random_forest(tfidf_matrix, labels)
    
    # Print results
    print("\nResults:")
    print(f"Training Accuracy: {results['train_accuracy']:.4f}")
    print(f"Validation Accuracy: {results['test_accuracy']:.4f}")
    print(f"Training F1 Score: {results['train_f1']:.4f}")
    print(f"Validation F1 Score: {results['test_f1']:.4f}")
    
    # Predict test data
    print("\nPredicting test data...")
    predictions = predict_test_data(
        results['model'],
        vectorizer,
        test_file_path
    )
    
    # Save predictions to file
    print("\nSaving predictions...")
    with open('prediction-classification-round3.json', 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)
    
    print("\nPredictions saved to 'prediction-classification-round3.json'")
    print("Confusion matrix saved as 'confusion_matrix.png'")

Processing training data...
Processed 10000 lines...
Processed 20000 lines...
Processed 30000 lines...
Processed 40000 lines...
Processed 50000 lines...
Processed 60000 lines...
Processed 70000 lines...
Processed 80000 lines...
Processed 90000 lines...
Finished processing with 0 errors

Total training samples: 93927
Number of classes: 10

Creating TF-IDF matrix...





Training Random Forest model...

Results:
Training Accuracy: 0.9914
Validation Accuracy: 0.8929
Training F1 Score: 0.9914
Validation F1 Score: 0.8929

Predicting test data...

Saving predictions...

Predictions saved to 'prediction-classification-round3.json'
Confusion matrix saved as 'confusion_matrix.png'


In [12]:
def calculate_class_weights(labels):
    """
    Calculate inverse class frequencies for weighting
    """
    class_counts = Counter(labels)
    total_samples = len(labels)
    class_weights = {
        label: total_samples / (len(class_counts) * count)
        for label, count in class_counts.items()
    }
    return class_weights

def predict_test_data(model, vectorizer, test_file_path: str, class_weights: dict):
    """
    Predict categories for test data and aggregate by username with balanced voting
    """
    username_predictions = {}
    username_texts = {}
    
    # Read and process test data
    with open(test_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                entry = json.loads(line.strip())
                username = entry.get('username')
                if not username:
                    continue
                
                # Process text
                text = process_json_entry(entry)
                if text:
                    if username not in username_texts:
                        username_texts[username] = []
                    username_texts[username].append(text)
            except json.JSONDecodeError:
                continue
    
    # Make predictions for each username
    for username, texts in username_texts.items():
        # Transform texts to TF-IDF
        text_features = vectorizer.transform(texts)
        
        # Get predictions for all posts
        predictions = model.predict(text_features)
        
        # Weight the votes based on class distribution
        weighted_votes = Counter()
        for pred in predictions:
            weighted_votes[pred] += class_weights[pred]
        
        # Get maximum weighted vote
        max_weight = max(weighted_votes.values())
        
        # Get all labels with maximum weighted votes
        max_labels = [label for label, weight in weighted_votes.items() 
                     if weight == max_weight]
        
        if len(max_labels) == 1:
            # Clear winner after weighting
            final_prediction = max_labels[0]
        else:
            # Tie-breaking: use the label with highest confidence scores
            # weighted by class distribution
            confidences = []
            for label in max_labels:
                # Get indices where this label was predicted
                label_indices = [i for i, pred in enumerate(predictions) if pred == label]
                # Get average confidence for this label
                label_confidences = model.predict_proba(text_features[label_indices])
                avg_confidence = np.mean([conf[list(model.classes_).index(label)] 
                                       for conf in label_confidences])
                # Weight the confidence by class weight
                weighted_confidence = avg_confidence * class_weights[label]
                confidences.append((label, weighted_confidence))
            
            # Choose label with highest weighted confidence
            final_prediction = max(confidences, key=lambda x: x[1])[0]
        
        username_predictions[username] = final_prediction
    
    return username_predictions

# Training phase
train_file_path = "processed_dataset-r3v10.jsonl"
test_file_path = "processed_dataset-r3v10-test.jsonl"

# Process training data
print("Processing training data...")
processed_texts, labels = process_jsonl_file(train_file_path)

if processed_texts:
    # Calculate class weights
    print("\nCalculating class weights...")
    class_weights = calculate_class_weights(labels)
    print("\nClass distribution weights:")
    for label, weight in class_weights.items():
        print(f"{label}: {weight:.3f}")
    
    # Create TF-IDF matrix
    print("\nCreating TF-IDF matrix...")
    vectorizer, tfidf_matrix = create_tfidf_matrix(processed_texts)
    
    # Train and evaluate model
    print("\nTraining Random Forest model...")
    results = train_and_evaluate_random_forest(tfidf_matrix, labels)
    
    # Print results
    print("\nResults:")
    print(f"Training Accuracy: {results['train_accuracy']:.4f}")
    print(f"Validation Accuracy: {results['test_accuracy']:.4f}")
    print(f"Training F1 Score: {results['train_f1']:.4f}")
    print(f"Validation F1 Score: {results['test_f1']:.4f}")
    
    # Predict test data with balanced voting
    print("\nPredicting test data with balanced voting...")
    predictions = predict_test_data(
        results['model'],
        vectorizer,
        test_file_path,
        class_weights
    )
    
    # Save predictions to file
    print("\nSaving predictions...")
    with open('prediction-classification-round3.json', 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)
    
    print("\nPredictions saved to 'prediction-classification-round3.json'")
    print("Confusion matrix saved as 'confusion_matrix.png'")

Processing training data...
Processed 10000 lines...
Processed 20000 lines...
Processed 30000 lines...
Processed 40000 lines...
Processed 50000 lines...
Processed 60000 lines...
Processed 70000 lines...
Processed 80000 lines...
Processed 90000 lines...
Finished processing with 0 errors

Calculating class weights...

Class distribution weights:
tech: 0.796
food: 0.538
health and lifestyle: 0.538
travel: 0.944
sports: 2.432
fashion: 0.909
entertainment: 0.860
mom and children: 1.815
art: 1.442
gaming: 20.643

Creating TF-IDF matrix...





Training Random Forest model...

Results:
Training Accuracy: 0.9914
Validation Accuracy: 0.8929
Training F1 Score: 0.9914
Validation F1 Score: 0.8929

Predicting test data with balanced voting...

Saving predictions...

Predictions saved to 'prediction-classification-round3.json'
Confusion matrix saved as 'confusion_matrix.png'


In [13]:
def train_and_evaluate_random_forest(X, y, random_state=42):
    """
    Train and evaluate Random Forest model with class balancing
    """
    # Calculate class weights
    class_counts = Counter(y)
    total_samples = len(y)
    n_classes = len(class_counts)
    
    # Compute balanced class weights
    class_weights = {
        label: total_samples / (n_classes * count)
        for label, count in class_counts.items()
    }
    
    # Print class distribution
    print("\nClass distribution:")
    for label, count in class_counts.items():
        print(f"{label}: {count} samples ({count/total_samples*100:.2f}%)")
    
    # Split the data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    
    # Initialize and train the model with class weights
    rf_model = RandomForestClassifier(
        n_estimators=200,  # Increased number of trees
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight=class_weights,  # Add class weights
        random_state=random_state,
        n_jobs=-1,
        bootstrap=True,  # Enable bootstrapping for better balance
        max_samples=0.8  # Use 80% of samples for each tree
    )
    
    # Train the model
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    train_preds = rf_model.predict(X_train)
    test_preds = rf_model.predict(X_test)
    
    # Calculate metrics
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy = accuracy_score(y_test, test_preds)
    
    # Calculate F1 scores (both micro and macro for imbalanced cases)
    train_f1_micro = f1_score(y_train, train_preds, average='micro')
    train_f1_macro = f1_score(y_train, train_preds, average='macro')
    test_f1_micro = f1_score(y_test, test_preds, average='micro')
    test_f1_macro = f1_score(y_test, test_preds, average='macro')
    
    # Create confusion matrix
    cm = confusion_matrix(y_test, test_preds)
    
    # Plot confusion matrix with percentages
    plt.figure(figsize=(12, 8))
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    sns.heatmap(cm_normalized, annot=True, fmt='.2%', cmap='Blues')
    plt.title('Normalized Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
    # Get unique labels for axis labels
    unique_labels = sorted(set(y))
    plt.xticks(np.arange(len(unique_labels)) + 0.5, unique_labels, rotation=45)
    plt.yticks(np.arange(len(unique_labels)) + 0.5, unique_labels, rotation=45)
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()
    
    # Calculate per-class metrics
    class_metrics = {}
    for label in unique_labels:
        class_mask_test = (y_test == label)
        class_preds_test = (test_preds == label)
        
        true_pos = np.sum((y_test == label) & (test_preds == label))
        total_actual = np.sum(y_test == label)
        total_pred = np.sum(test_preds == label)
        
        precision = true_pos / total_pred if total_pred > 0 else 0
        recall = true_pos / total_actual if total_actual > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        class_metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1': f1
        }
    
    return {
        'train_accuracy': train_accuracy,
        'test_accuracy': test_accuracy,
        'train_f1_micro': train_f1_micro,
        'train_f1_macro': train_f1_macro,
        'test_f1_micro': test_f1_micro,
        'test_f1_macro': test_f1_macro,
        'model': rf_model,
        'feature_importance': rf_model.feature_importances_,
        'confusion_matrix': cm,
        'labels': unique_labels,
        'class_weights': class_weights,
        'class_metrics': class_metrics
    }

train_file_path = "processed_dataset-r3v10.jsonl"
test_file_path = "processed_dataset-r3v10-test.jsonl"

print("Processing training data...")
processed_texts, labels = process_jsonl_file(train_file_path)

if processed_texts:
    # Create TF-IDF matrix
    print("\nCreating TF-IDF matrix...")
    vectorizer, tfidf_matrix = create_tfidf_matrix(processed_texts)
    
    # Train and evaluate model
    print("\nTraining balanced Random Forest model...")
    results = train_and_evaluate_random_forest(tfidf_matrix, labels)
    
    # Print detailed results
    print("\nOverall Results:")
    print(f"Training Accuracy: {results['train_accuracy']:.4f}")
    print(f"Validation Accuracy: {results['test_accuracy']:.4f}")
    print(f"Training F1 (micro): {results['train_f1_micro']:.4f}")
    print(f"Training F1 (macro): {results['train_f1_macro']:.4f}")
    print(f"Validation F1 (micro): {results['test_f1_micro']:.4f}")
    print(f"Validation F1 (macro): {results['test_f1_macro']:.4f}")
    
    print("\nPer-class Performance:")
    for label, metrics in results['class_metrics'].items():
        print(f"\nClass: {label}")
        print(f"Precision: {metrics['precision']:.4f}")
        print(f"Recall: {metrics['recall']:.4f}")
        print(f"F1-score: {metrics['f1']:.4f}")
    
    # Predict test data with balanced voting
    print("\nPredicting test data...")
    predictions = predict_test_data(
        results['model'],
        vectorizer,
        test_file_path,
        results['class_weights']
    )
    
    # Save predictions
    print("\nSaving predictions...")
    with open('prediction-classification-round3.json', 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)
    
    print("\nPredictions saved to 'prediction-classification-round3.json'")
    print("Confusion matrix saved as 'confusion_matrix.png'")

Processing training data...
Processed 10000 lines...
Processed 20000 lines...
Processed 30000 lines...
Processed 40000 lines...
Processed 50000 lines...
Processed 60000 lines...
Processed 70000 lines...
Processed 80000 lines...
Processed 90000 lines...
Finished processing with 0 errors

Creating TF-IDF matrix...





Training balanced Random Forest model...

Class distribution:
tech: 11801 samples (12.56%)
food: 17462 samples (18.59%)
health and lifestyle: 17449 samples (18.58%)
travel: 9954 samples (10.60%)
sports: 3862 samples (4.11%)
fashion: 10337 samples (11.01%)
entertainment: 10917 samples (11.62%)
mom and children: 5175 samples (5.51%)
art: 6515 samples (6.94%)
gaming: 455 samples (0.48%)

Overall Results:
Training Accuracy: 0.9914
Validation Accuracy: 0.8899
Training F1 (micro): 0.9914
Training F1 (macro): 0.9889
Validation F1 (micro): 0.8899
Validation F1 (macro): 0.8857

Per-class Performance:

Class: art
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Class: entertainment
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Class: fashion
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Class: food
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Class: gaming
Precision: 0.0000
Recall: 0.0000
F1-score: 0.0000

Class: health and lifestyle
Precision: 0.0000
Recall: 0.0000
F1-score: 