# Libraries

In [7]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier  # Use LightGBM for improved efficiency
from tqdm import tqdm
from textblob import TextBlob

def load_large_dataset(filepath, chunksize=100000):
    print("Loading dataset in chunks...")
    chunks = []
    for chunk in tqdm(pd.read_csv(filepath, chunksize=chunksize), desc="Loading chunks"):
        chunks.append(chunk)
    return pd.concat(chunks, axis=0)

# Preprocessing and feature engineering
def preprocess_data(df, is_training=True):
    print(f"Preprocessing data (Training: {is_training})...")
    
    # Handle missing values in the target
    if is_training:
        df = df.dropna(subset=['Score'])  # Remove rows where target is missing
    
    # Convert 'Score' to integers
    if 'Score' in df.columns and is_training:
        df['Score'] = df['Score'].astype(int)
    
    # Drop columns that should not be used for training
    if 'Id' in df.columns:
        df = df.drop(columns=['Id'])  # Dropping ID column
    
    # Add new features: sentiment polarity, review length, and additional text features
    if 'review_text' in df.columns:
        print("Extracting new features from review text...")
        df['sentiment_polarity'] = df['review_text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
        df['review_length'] = df['review_text'].apply(lambda x: len(str(x).split()))
        df['special_chars_count'] = df['review_text'].apply(lambda x: sum(1 for char in str(x) if char in ['!', '?']))
        df['avg_word_length'] = df['review_text'].apply(lambda x: np.mean([len(word) for word in str(x).split()]) if str(x).split() else 0)
    
    # Identify non-numeric columns that need encoding
    non_numeric_cols = df.select_dtypes(include=['object']).columns
    for col in non_numeric_cols:
        if col != 'Score':  # Ensure we don't encode the target variable
            label_enc = LabelEncoder()
            df[col] = label_enc.fit_transform(df[col].astype(str))
    
    # Optimized Text Vectorization
    if 'review_text' in df.columns:
        print("Applying optimized text vectorization...")
        hashing = HashingVectorizer(n_features=3000, alternate_sign=False)  # Use HashingVectorizer for faster processing
        text_features = hashing.transform(df['review_text'])
        df = pd.concat([df.drop(columns=['review_text']), pd.DataFrame(text_features.toarray())], axis=1)
        print("Text vectorization completed.")
    
    # Handling numerical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    print(f"Standardizing numerical columns: {list(numerical_cols)}")
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    return df

# Feature engineering and splitting data
def prepare_data(df):
    print("Splitting data into training and testing sets...")
    X = df.drop(['Score'], axis=1)
    y = df['Score']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Model training with LightGBM
def train_and_evaluate(X_train, X_test, y_train, y_test):
    print("Training LightGBM model with adjusted class weights...")
    model = LGBMClassifier(
        n_estimators=150,  # Limited to 150 trees for efficiency
        max_depth=15,  # Limit depth for balanced performance
        learning_rate=0.1,
        class_weight='balanced',  # Adjust for class imbalance
        random_state=42
    )
    model.fit(X_train, y_train)
    
    # Predictions and evaluation
    print("Model training completed. Evaluating...")
    predictions = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    
    return model

# Prediction and submission creation
def create_submission(model, test_filepath):
    print("Creating submission file...")
    # Load and preprocess the test dataset
    test_df = pd.read_csv(test_filepath)
    test_df_processed = preprocess_data(test_df, is_training=False)
    
    # Making predictions
    X_submission = test_df_processed.drop(columns=['Score'], errors='ignore')
    X_submission['Score'] = model.predict(X_submission)
    
    # Create the submission file
    submission = X_submission[['Id', 'Score']]
    submission.to_csv("./data/submission.csv", index=False)
    print("Submission file created at './data/submission.csv'")

# Main function to execute the pipeline
def main():
    train_filepath = './data/train.csv'  # Adjusted path to train file
    test_filepath = './data/test.csv'    # Adjusted path to test file
    
    # Load dataset
    print("Starting data loading...")
    df = load_large_dataset(train_filepath)
    print("Data loading completed.")
    
    # Preprocess the data
    df = preprocess_data(df)
    
    # Split the data into training and testing
    X_train, X_test, y_train, y_test = prepare_data(df)
    
    # Train and evaluate the model
    model = train_and_evaluate(X_train, X_test, y_train, y_test)
    
    # Create submission
    create_submission(model, test_filepath)
    
    return model
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the training data in chunks if it's too large
def load_large_dataset(filepath, chunksize=100000):
    print("Loading dataset in chunks...")
    chunks = []
    for chunk in pd.read_csv(filepath, chunksize=chunksize):
        chunks.append(chunk)
        print(f"Loaded {len(chunks) * chunksize} rows so far...")
    return pd.concat(chunks, axis=0)

# Preprocessing and feature engineering
def preprocess_data(df, is_training=True):
    print(f"Preprocessing data (Training: {is_training})...")
    # Example preprocessing: Handle missing values
    if is_training:
        df = df.dropna(subset=['Score'])  # Remove rows where target is missing

    # Ensure that 'Score' is categorical (e.g., convert to integers if it's float)
    if 'Score' in df.columns and is_training:
        df['Score'] = df['Score'].astype(int)  # Converting 'Score' to an integer type

    # Drop columns that should not be used for training
    if 'Id' in df.columns:
        df = df.drop(columns=['Id'])  # Dropping ID column

    # Identify non-numeric columns that need encoding
    non_numeric_cols = df.select_dtypes(include=['object']).columns
    for col in non_numeric_cols:
        if col != 'Score':  # Ensure we don't encode the target variable
            # Use Label Encoding for categorical columns
            label_enc = LabelEncoder()
            df[col] = label_enc.fit_transform(df[col].astype(str))
    
    # Example text processing: Use Tfidf if there's a text column
    if 'review_text' in df.columns:
        print("Applying TF-IDF transformation...")
        tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
        if is_training:
            text_features = tfidf.fit_transform(df['review_text']).toarray()
        else:
            text_features = tfidf.transform(df['review_text']).toarray()
        df = pd.concat([df.drop(columns=['review_text']), pd.DataFrame(text_features)], axis=1)
        print("TF-IDF transformation completed.")

    # Handling numerical columns
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    print(f"Standardizing numerical columns: {list(numerical_cols)}")
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    return df

# Feature alignment function
def align_features(X_train, X_submission):
    # Get the training feature names
    train_columns = X_train.columns
    
    # Add any missing columns in the submission set
    for col in train_columns:
        if col not in X_submission:
            X_submission[col] = 0  # Fill missing features with 0
    
    # Ensure the same column order
    X_submission = X_submission[train_columns]
    
    return X_submission

# Feature engineering and splitting data
def prepare_data(df):
    print("Splitting data into training and testing sets...")
    X = df.drop(['Score'], axis=1)
    y = df['Score']
    return train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
def train_and_evaluate(X_train, X_test, y_train, y_test):
    print("Training Random Forest model...")
    # Using a Random Forest as an example
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predictions and evaluation
    print("Model training completed. Evaluating...")
    predictions = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))
    
    return model

# Prediction and submission creation
def create_submission(model, X_train, test_filepath):
    print("Creating submission file...")
    
    # Load and preprocess the test dataset
    test_df = pd.read_csv(test_filepath)
    print(f"Test data loaded with shape: {test_df.shape}")
    
    # Preprocess the test data
    test_df_processed = preprocess_data(test_df, is_training=False)
    print(f"Test data processed with shape: {test_df_processed.shape}")
    
    # Prepare the test data for prediction
    X_submission = test_df_processed.drop(columns=['Score'], errors='ignore')
    X_submission = align_features(X_train, X_submission)  # Align features with training data
    
    # Make predictions
    X_submission['Score'] = model.predict(X_submission)
    
    # Create the submission file
    submission = test_df[['Id']].copy()  # Ensuring the 'Id' column is intact
    submission['Score'] = X_submission['Score']
    submission.to_csv("./data/submission.csv", index=False)
    print("Submission file created at './data/submission.csv'")

# Main function to execute the pipeline
def main():
    train_filepath = './data/train.csv'  # Adjusted path to train file
    test_filepath = './data/test.csv'    # Adjusted path to test file
    
    # Load dataset
    print("Starting data loading...")
    df = load_large_dataset(train_filepath)
    print("Data loading completed.")
    
    # Preprocess the data
    df = preprocess_data(df)
    
    # Split the data into training and testing
    X_train, X_test, y_train, y_test = prepare_data(df)
    
    # Train and evaluate the model
    model = train_and_evaluate(X_train, X_test, y_train, y_test)
    
    # Create submission
    create_submission(model, X_train, test_filepath)
    
    return model

# Call the main function
if __name__ == "__main__":
    main()


Starting data loading...
Loading dataset in chunks...
Loaded 100000 rows so far...
Loaded 200000 rows so far...
Loaded 300000 rows so far...
Loaded 400000 rows so far...
Loaded 500000 rows so far...
Loaded 600000 rows so far...
Loaded 700000 rows so far...
Loaded 800000 rows so far...
Loaded 900000 rows so far...
Loaded 1000000 rows so far...
Loaded 1100000 rows so far...
Loaded 1200000 rows so far...
Loaded 1300000 rows so far...
Loaded 1400000 rows so far...
Loaded 1500000 rows so far...
Loaded 1600000 rows so far...
Loaded 1700000 rows so far...
Data loading completed.
Preprocessing data (Training: True)...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Score'] = df['Score'].astype(int)  # Converting 'Score' to an integer type


Standardizing numerical columns: ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time']
Splitting data into training and testing sets...
Training Random Forest model...
Model training completed. Evaluating...
Accuracy: 0.560829302283308
Classification Report:
               precision    recall  f1-score   support

           1       0.49      0.40      0.44     18074
           2       0.26      0.07      0.11     17604
           3       0.33      0.10      0.15     35179
           4       0.37      0.13      0.19     67127
           5       0.60      0.92      0.73    159085

    accuracy                           0.56    297069
   macro avg       0.41      0.32      0.32    297069
weighted avg       0.49      0.56      0.48    297069

Creating submission file...
Test data loaded with shape: (212192, 2)
Preprocessing data (Training: False)...
Standardizing numerical columns: ['Score']
Test data processed with shape: (212192, 1)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  result = op(x, *args, **kwargs)


Submission file created at './data/submission.csv'
Starting data loading...
Loading dataset in chunks...
Loaded 100000 rows so far...
Loaded 200000 rows so far...
Loaded 300000 rows so far...
Loaded 400000 rows so far...
Loaded 500000 rows so far...
Loaded 600000 rows so far...
Loaded 700000 rows so far...
Loaded 800000 rows so far...
Loaded 900000 rows so far...
Loaded 1000000 rows so far...
Loaded 1100000 rows so far...
Loaded 1200000 rows so far...
Loaded 1300000 rows so far...
Loaded 1400000 rows so far...
Loaded 1500000 rows so far...
Loaded 1600000 rows so far...
Loaded 1700000 rows so far...
Data loading completed.
Preprocessing data (Training: True)...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Score'] = df['Score'].astype(int)  # Converting 'Score' to an integer type


Standardizing numerical columns: ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time']
Splitting data into training and testing sets...
Training Random Forest model...


KeyboardInterrupt: 