# Libraries

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

def load_large_dataset(filepath, chunksize=100000):
    """Read a CSV file chunk by chunk and return it as one DataFrame.

    Args:
        filepath: Path of the CSV file, passed straight to ``pd.read_csv``.
        chunksize: Number of rows per chunk.

    Returns:
        The row-wise concatenation of every chunk that was read.
    """
    print("Loading dataset in chunks...")
    reader = pd.read_csv(filepath, chunksize=chunksize)
    # tqdm only wraps the chunk iterator to show progress while reading.
    frames = list(tqdm(reader, desc="Loading chunks"))
    return pd.concat(frames, axis=0)
# Preprocessing and feature engineering
# Transformers fitted by the most recent training call. Inference calls reuse
# them so that train and test rows are encoded and scaled identically.
_PREPROCESS_STATE = {}

_TARGET_COL = 'Score'
_TEXT_COL = 'review_text'


def _encode_categoricals(df, is_training):
    """Label-encode object columns, skipping the target and the raw text column."""
    encoders = _PREPROCESS_STATE.setdefault('encoders', {})
    for col in df.select_dtypes(include=['object']).columns:
        # The text column must stay as text for TF-IDF; the target is never encoded.
        if col in (_TARGET_COL, _TEXT_COL):
            continue
        values = df[col].astype(str)
        mapping = None if is_training else encoders.get(col)
        if mapping is None:
            label_enc = LabelEncoder()
            label_enc.fit(values)
            mapping = {str(label): code for code, label in enumerate(label_enc.classes_)}
            if is_training:
                encoders[col] = mapping
        # Categories not seen when the mapping was fitted become -1 instead of raising.
        df[col] = values.map(mapping).fillna(-1).astype('int64')
    return df


def _add_tfidf_features(df, is_training):
    """Replace the text column with dense TF-IDF feature columns."""
    print("Applying TF-IDF transformation...")
    text = df[_TEXT_COL].fillna('').astype(str)
    if is_training:
        tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
        matrix = tfidf.fit_transform(text)
        _PREPROCESS_STATE['tfidf'] = tfidf
    else:
        tfidf = _PREPROCESS_STATE.get('tfidf')
        if tfidf is None:
            # No training call has happened: transform() on the unfitted
            # vectorizer raises, exactly as it did before.
            tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
        matrix = tfidf.transform(text)
    # Reuse df's index so rows stay aligned after dropna(), and use string
    # column names so the frame does not mix int and str labels.
    text_df = pd.DataFrame(
        matrix.toarray(),
        index=df.index,
        columns=[f"tfidf_{i}" for i in range(matrix.shape[1])],
    )
    df = pd.concat([df.drop(columns=[_TEXT_COL]), text_df], axis=1)
    print("TF-IDF transformation completed.")
    return df


def _scale_numeric(df, is_training):
    """Standardize numeric feature columns; the target column is never scaled."""
    if df.empty:
        return df
    fitted = None if is_training else _PREPROCESS_STATE.get('scaler')
    if fitted is not None:
        scaler, numerical_cols = fitted
        print(f"Standardizing numerical columns: {list(numerical_cols)}")
        df[numerical_cols] = scaler.transform(df[numerical_cols])
        return df

    numerical_cols = [
        col for col in df.select_dtypes(include='number').columns
        if col != _TARGET_COL
    ]
    if not numerical_cols:
        # StandardScaler cannot be fitted on zero columns.
        return df
    print(f"Standardizing numerical columns: {numerical_cols}")
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    if is_training:
        _PREPROCESS_STATE['scaler'] = (scaler, numerical_cols)
    return df


def preprocess_data(df, is_training=True):
    """Clean a raw frame and turn it into model-ready features.

    Steps: drop rows with a missing 'Score' and cast it to int (training
    only), drop 'Id', label-encode object columns, expand 'review_text' into
    TF-IDF columns when present, and standardize numeric feature columns.

    A training call fits the encoders, vectorizer and scaler and remembers
    them at module level; a later ``is_training=False`` call reuses them. If
    no training call has happened, encoders and scaler are fitted on the given
    data, while the TF-IDF step raises because the vectorizer is unfitted.

    Args:
        df: Input DataFrame. It is not modified; a copy is processed.
        is_training: Whether ``df`` is training data containing a 'Score' column.

    Returns:
        A new DataFrame with 'Id' removed, 'Score' (if present) left unscaled,
        and all other columns encoded/scaled.

    Raises:
        KeyError: If ``is_training`` is true and 'Score' is missing, or if an
            inference frame lacks a column the stored scaler was fitted on.
    """
    print(f"Preprocessing data (Training: {is_training})...")
    if is_training:
        # Start from a clean slate so stale transformers are never reused.
        _PREPROCESS_STATE.clear()
        # Remove rows where the target is missing, then make it categorical.
        df = df.dropna(subset=[_TARGET_COL]).copy()
        df[_TARGET_COL] = df[_TARGET_COL].astype(int)
    else:
        df = df.copy()

    # Drop columns that should not be used for training
    df = df.drop(columns=['Id'], errors='ignore')

    df = _encode_categoricals(df, is_training)
    if _TEXT_COL in df.columns:
        df = _add_tfidf_features(df, is_training)
    return _scale_numeric(df, is_training)

# Feature engineering and splitting data
def prepare_data(df):
    """Separate features from the 'Score' target and split them 80/20.

    Args:
        df: Preprocessed DataFrame that contains a 'Score' column.

    Returns:
        The result of ``train_test_split``: train features, test features,
        train target, test target (in that order).
    """
    print("Splitting data into training and testing sets...")
    features = df.drop(columns=['Score'])
    target = df['Score']
    # A fixed seed keeps the split reproducible between runs.
    return train_test_split(features, target, test_size=0.2, random_state=42)

# Model training
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and report hold-out metrics.

    Args:
        X_train: Training features.
        X_test: Hold-out features.
        y_train: Training labels.
        y_test: Hold-out labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("Training Random Forest model...")
    # 100 trees with a fixed seed, fitted in a single chained call.
    forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

    print("Model training completed. Evaluating...")
    y_pred = forest.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return forest

# Prediction and submission creation
def create_submission(model, test_filepath, output_path="./data/submission.csv"):
    """Predict 'Score' for the test file and write an Id/Score CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_filepath: Path of the test CSV; it must contain an 'Id' column.
        output_path: Destination of the submission CSV. Missing parent
            directories are created.

    Raises:
        KeyError: If the test file has no 'Id' column.
    """
    from pathlib import Path

    print("Creating submission file...")
    # Load and preprocess the test dataset
    test_df = pd.read_csv(test_filepath)
    # preprocess_data drops 'Id', so keep the ids aside before calling it.
    ids = test_df['Id'].copy()
    test_df_processed = preprocess_data(test_df, is_training=False)

    # Making predictions on features only (no identifier, no target column)
    X_submission = test_df_processed.drop(columns=['Id', 'Score'], errors='ignore')
    predictions = model.predict(X_submission)

    # Create the submission file, pairing each prediction with its original id
    submission = pd.DataFrame({
        'Id': ids.loc[X_submission.index].to_numpy(),
        'Score': predictions,
    })
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(output_path, index=False)
    print(f"Submission file created at '{output_path}'")

# Main function to execute the pipeline
def main():
    """Run the whole pipeline: load, preprocess, split, train, and submit.

    Returns:
        The model returned by ``train_and_evaluate``.
    """
    train_filepath = './data/train.csv'
    test_filepath = './data/test.csv'

    # Load the training data
    print("Starting data loading...")
    raw_frame = load_large_dataset(train_filepath)
    print("Data loading completed.")

    # Preprocess, then split into train / hold-out parts
    prepared = preprocess_data(raw_frame)
    X_train, X_test, y_train, y_test = prepare_data(prepared)

    # Fit and evaluate the model
    fitted_model = train_and_evaluate(X_train, X_test, y_train, y_test)

    # Write predictions for the test file
    create_submission(fitted_model, test_filepath)

    return fitted_model

# Run the pipeline only when this file is executed as a script; importing it
# as a module defines the functions without starting a training run.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'textblob'