In [None]:
import pandas as pd
import mysql.connector
import os

# # ✅ Load Excel File
file_path = r"C:\Users\RUTVIK\Downloads\OneWay_Final_Updated.xlsx"
df = pd.read_excel(file_path)

# # ✅ Keep Only 'Review' and 'Sentiment' Columns (Ensure correct case)
df = df[['Review', 'Sentiment']]

# # ✅ Establish MySQL Connection
# DB_CONFIG = {
#     "host": os.getenv("DB_HOST"),  # Default to localhost if not set
#     "user": os.getenv("DB_USER"),
#     "password": os.getenv("DB_PASSWORD"),  # Replace with actual password
#     "database": "flight"
# }

# conn = mysql.connector.connect(**DB_CONFIG)
# cursor = conn.cursor()

# # ✅ Create Table if Not Exists
# cursor.execute("""
#     CREATE TABLE IF NOT EXISTS Reviews (
#         id INT AUTO_INCREMENT PRIMARY KEY,
#         review TEXT NOT NULL,
#         sentiment ENUM('positive', 'negative', 'neutral') NOT NULL
#     )
# """)

# # ✅ Insert Data into MySQL Table (Batch Insert for efficiency)
# insert_query = "INSERT INTO Reviews (review, sentiment) VALUES (%s, %s)"
# data = list(df.itertuples(index=False, name=None))  # Convert DataFrame to list of tuples

# cursor.executemany(insert_query, data)  # Faster bulk insert

# conn.commit()
# cursor.close()
# conn.close()

# print("✅ Data imported successfully!")


In [2]:
print(df.columns)

Index(['Review', 'Sentiment'], dtype='object')


In [3]:
df["Sentiment"].unique()


array(['positive', 'negative', 'neutral'], dtype=object)

In [4]:
df["Sentiment"].value_counts()


Sentiment
neutral     13634
negative    13511
positive    12834
Name: count, dtype: int64

In [5]:
import os
import re
import numpy as np
import pandas as pd
import joblib
import pickle
import mysql.connector
from sqlalchemy import create_engine
from dotenv import load_dotenv
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dropout


In [8]:
%pip install mysql-connector-python SQLAlchemy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
try:
    with engine.connect() as conn:
        print("Connection successful!")
except Exception as e:
    print(f"Error: {e}")

Connection successful!


In [None]:
# Load environment variables
load_dotenv()
DB_URL = os.getenv("DB_URL")

# Establish database connection using SQLAlchemy
engine = create_engine(DB_URL)

# Obtain raw connection to use cursor functionality
connection = engine.raw_connection()
cursor = connection.cursor()

# Execute a query to fetch flight reviews using the cursor
cursor.execute("SELECT * FROM Reviews")
reviews = cursor.fetchall()

# Convert the fetched reviews into a pandas DataFrame
columns = [desc[0] for desc in cursor.description]  # Get column names
df = pd.DataFrame(reviews, columns=columns)

# Text Cleaning Class
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.apply(self.clean_text)
    
    @staticmethod
    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

# Apply text cleaning
cleaner = TextCleaner()
df['cleaned_review'] = cleaner.transform(df['review'])

# Encode sentiment labels
sentiment_mapping = {"positive": 1, "negative": 0, "neutral": 2}
df["sentiment"] = df["sentiment"].map(sentiment_mapping)

# Split dataset before transformation
X_texts = df["cleaned_review"].tolist()
y_labels = df["sentiment"].values

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    X_texts, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)



In [48]:
# TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=50000)
X_train_tfidf = vectorizer.fit_transform(X_train_texts)
X_test_tfidf = vectorizer.transform(X_test_texts)

# Save vectorizer
pickle.dump(vectorizer, open("tfidf_vectorizer.pkl", "wb"))

In [47]:
# Train Random Forest Model
# rf_model = RandomForestClassifier(n_estimators=200, max_depth=30, random_state=42)
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=200, min_samples_split=5, random_state=42, bootstrap=True)
rf_model.fit(X_train_tfidf, y_train)
joblib.dump(rf_model, "random_forest.pkl")

['random_forest.pkl']

In [21]:
# Train XGBoost Model
# xgb_model = xgb.XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=6, eval_metric="mlogloss")
xgb_model = xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=10, subsample=0.8, reg_lambda=1, eval_metric="mlogloss")

xgb_model.fit(X_train_tfidf, y_train)
joblib.dump(xgb_model, "xgboost.pkl")

['xgboost.pkl']

In [22]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# Train Deep Learning Model
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_texts)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_texts), maxlen=100, padding="post")
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test_texts), maxlen=100, padding="post")


model = Sequential([
    Embedding(input_dim=50000, output_dim=128, input_length=100),
    GlobalAveragePooling1D(),
    Dense(256, activation="relu"),
    Dropout(0.4),
    Dense(128, activation="relu"),
    Dropout(0.4),
    Dense(64, activation="relu"),
    Dense(3, activation="softmax")
])

model.compile(optimizer="adamax", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
model_checkpoint = ModelCheckpoint("best_tf_sentiment_model.h5", save_best_only=True)

model.fit(X_train_seq, y_train, epochs=20, batch_size=128, validation_data=(X_test_seq, y_test), callbacks=[early_stopping, model_checkpoint])


model.save("tf_sentiment_model.h5")
pickle.dump(tokenizer, open("tokenizer.pkl", "wb"))



Epoch 1/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.4957 - loss: 0.9946



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 77ms/step - accuracy: 0.4966 - loss: 0.9934 - val_accuracy: 1.0000 - val_loss: 0.0086
Epoch 2/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.9999 - loss: 0.0113



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 65ms/step - accuracy: 0.9999 - loss: 0.0113 - val_accuracy: 1.0000 - val_loss: 4.0707e-04
Epoch 3/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 1.0000 - loss: 0.0018



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 65ms/step - accuracy: 1.0000 - loss: 0.0018 - val_accuracy: 1.0000 - val_loss: 1.1803e-04
Epoch 4/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 1.0000 - loss: 7.5808e-04



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 70ms/step - accuracy: 1.0000 - loss: 7.5765e-04 - val_accuracy: 1.0000 - val_loss: 5.3497e-05
Epoch 5/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 1.0000 - loss: 3.8778e-04



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 67ms/step - accuracy: 1.0000 - loss: 3.8757e-04 - val_accuracy: 1.0000 - val_loss: 2.7203e-05
Epoch 6/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 1.0000 - loss: 2.2750e-04



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 63ms/step - accuracy: 1.0000 - loss: 2.2743e-04 - val_accuracy: 1.0000 - val_loss: 1.4236e-05
Epoch 7/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 1.0000 - loss: 1.8773e-04



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 64ms/step - accuracy: 1.0000 - loss: 1.8770e-04 - val_accuracy: 1.0000 - val_loss: 9.4735e-06
Epoch 8/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step - accuracy: 1.0000 - loss: 1.0884e-04



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 88ms/step - accuracy: 1.0000 - loss: 1.0882e-04 - val_accuracy: 1.0000 - val_loss: 5.7826e-06
Epoch 9/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 1.0000 - loss: 7.9213e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 71ms/step - accuracy: 1.0000 - loss: 7.9179e-05 - val_accuracy: 1.0000 - val_loss: 3.3658e-06
Epoch 10/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 1.0000 - loss: 5.9214e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 1.0000 - loss: 5.9191e-05 - val_accuracy: 1.0000 - val_loss: 2.0231e-06
Epoch 11/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 1.0000 - loss: 4.1164e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 70ms/step - accuracy: 1.0000 - loss: 4.1158e-05 - val_accuracy: 1.0000 - val_loss: 1.2907e-06
Epoch 12/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 1.0000 - loss: 3.0218e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 1.0000 - loss: 3.0220e-05 - val_accuracy: 1.0000 - val_loss: 8.9071e-07
Epoch 13/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 1.0000 - loss: 2.8767e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 1.0000 - loss: 2.8759e-05 - val_accuracy: 1.0000 - val_loss: 6.8368e-07
Epoch 14/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 1.0000 - loss: 1.6020e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 1.0000 - loss: 1.6019e-05 - val_accuracy: 1.0000 - val_loss: 3.8628e-07
Epoch 15/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 1.0000 - loss: 1.4248e-05



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 77ms/step - accuracy: 1.0000 - loss: 1.4250e-05 - val_accuracy: 1.0000 - val_loss: 3.8464e-07
Epoch 16/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 1.0000 - loss: 9.6512e-06



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 77ms/step - accuracy: 1.0000 - loss: 9.6502e-06 - val_accuracy: 1.0000 - val_loss: 1.7343e-07
Epoch 17/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 1.0000 - loss: 8.1361e-06



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 1.0000 - loss: 8.1344e-06 - val_accuracy: 1.0000 - val_loss: 1.0937e-07
Epoch 18/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 1.0000 - loss: 4.7343e-06



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 70ms/step - accuracy: 1.0000 - loss: 4.7346e-06 - val_accuracy: 1.0000 - val_loss: 5.0048e-08
Epoch 19/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 1.0000 - loss: 5.0447e-06



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 75ms/step - accuracy: 1.0000 - loss: 5.0415e-06 - val_accuracy: 1.0000 - val_loss: 3.0384e-08
Epoch 20/20
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 67ms/step - accuracy: 1.0000 - loss: 9.1756e-06 - val_accuracy: 1.0000 - val_loss: 6.3004e-08




In [23]:
# Evaluate Models
rf_accuracy = accuracy_score(y_test, rf_model.predict(X_test_tfidf))
xgb_accuracy = accuracy_score(y_test, xgb_model.predict(X_test_tfidf))
dl_accuracy = model.evaluate(X_test_seq, y_test, verbose=0)[1]

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")
print(f"Deep Learning Model Accuracy: {dl_accuracy:.4f}")

Random Forest Accuracy: 1.0000
XGBoost Accuracy: 1.0000
Deep Learning Model Accuracy: 1.0000


In [24]:
# Select Best Model
best_model_name = "Random Forest" if rf_accuracy >= xgb_accuracy and rf_accuracy >= dl_accuracy else (
    "XGBoost" if xgb_accuracy >= dl_accuracy else "Deep Learning")

print(f"Best Model: {best_model_name}")

best_model = rf_model if best_model_name == "Random Forest" else (
    xgb_model if best_model_name == "XGBoost" else model)

Best Model: Random Forest


In [25]:

# Sentiment Prediction Pipeline
if best_model_name == "Deep Learning":
    sentiment_pipeline = Pipeline([
        ('text_cleaning', TextCleaner()),
        ('tokenizer', tokenizer),
        ('pad_sequences', None),  # Custom transformer for padding sequences can be added
        ('classifier', best_model)
    ])
else:
    sentiment_pipeline = Pipeline([
        ('text_cleaning', TextCleaner()),
        ('vectorizer', vectorizer),
        ('classifier', best_model)
    ])

joblib.dump(sentiment_pipeline, "sentiment.pkl")

['sentiment.pkl']

In [None]:
# Sentiment Prediction Function
def predict_sentiment(review_text):
    review_text_cleaned = cleaner.clean_text(review_text)
    if best_model_name == "Deep Learning":
        sequence = pad_sequences(tokenizer.texts_to_sequences([review_text_cleaned]), maxlen=100, padding="post")
        prediction = np.argmax(best_model.predict(sequence), axis=1)[0]
    else:
        prediction = best_model.predict(vectorizer.transform([review_text_cleaned]))[0]
    sentiment_labels = {1: "Positive", 0: "Negative", 2: "Neutral"}
    return sentiment_labels.get(prediction, "Unknown")

# Obtain raw connection to use cursor functionality
connection = engine.raw_connection()
cursor = connection.cursor()

# Execute a query to fetch flight reviews for inference
cursor.execute("SELECT * FROM Reviews")
inference_reviews_data = cursor.fetchall()

# Convert the fetched reviews into a pandas DataFrame
columns = [desc[0] for desc in cursor.description]  # Get column names
inference_reviews = pd.DataFrame(inference_reviews_data, columns=columns)

# Apply the sentiment prediction function to the reviews
inference_reviews["Predicted_Sentiment"] = inference_reviews["review"].apply(predict_sentiment)


# Print the DataFrame with predicted sentiments
print(inference_reviews)



                                                  review sentiment  \
0      If you're looking for a great airline, this is...  positive   
1      Loved every moment! The incredible was super W...  positive   
2      Absolutely fantastic! The perfect was check-in...  positive   
3      Avoid this airline! The horrible was aircraft ...  negative   
4      Didn't impress, but didn't disappoint either. ...   neutral   
...                                                  ...       ...   
39974  A huge disappointment. The poor was security a...  negative   
39975  Worst flight ever! The nightmarish was overhea...  negative   
39976  A truly enjoyable flight! The comfortable was ...  positive   
39977  Absolutely fantastic! The perfect was baggage ...  positive   
39978  It was a disaster! The frustrating was bathroo...  negative   

      Predicted_Sentiment  
0                Positive  
1                Positive  
2                Positive  
3                Negative  
4                 N

In [None]:
# Sentiment Prediction Function
def predict_sentiment(review_text):
    review_text_cleaned = cleaner.clean_text(review_text)
    if best_model_name == "Deep Learning":
        sequence = pad_sequences(tokenizer.texts_to_sequences([review_text_cleaned]), maxlen=100, padding="post")
        prediction = np.argmax(best_model.predict(sequence), axis=1)[0]
    else:
        prediction = best_model.predict(vectorizer.transform([review_text_cleaned]))[0]
    sentiment_labels = {1: "Positive", 0: "Negative", 2: "Neutral"}
    return sentiment_labels.get(prediction, "Unknown")

# Obtain raw connection to use cursor functionality
connection = engine.raw_connection()
cursor = connection.cursor()

# Execute a query to fetch flight reviews for inference
cursor.execute("SELECT * FROM Reviews")
inference_reviews_data = cursor.fetchall()

# Convert the fetched reviews into a pandas DataFrame
columns = [desc[0] for desc in cursor.description]  # Get column names
inference_reviews = pd.DataFrame(inference_reviews_data, columns=columns)

# Apply the sentiment prediction function to the reviews
inference_reviews["Predicted_Sentiment"] = inference_reviews["review"].apply(predict_sentiment)


# Print the DataFrame with predicted sentiments
print(inference_reviews)



[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
Results for Random Forest:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
Confusion Matrix:
 [[2702    0    0]
 [   0 2567    0]
 [   0    0 2727]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2702
           1       1.00      1.00      1.00      2567
           2       1.00      1.00      1.00      2727

    accuracy                           1.00      7996
   macro avg       1.00      1.00      1.00      7996
weighted avg       1.00      1.00      1.00      7996

Results for XGBoost:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
Confusion Matrix:
 [[2702    0    0]
 [   0 2567    0]
 [   0    0 2727]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2702
           1       1.00      1.00      1.00      2567
   

In [None]:
# Sentiment Prediction Function
def predict_sentiment(review_text):
    review_text_cleaned = cleaner.clean_text(review_text)
    if best_model_name == "Deep Learning":
        sequence = pad_sequences(tokenizer.texts_to_sequences([review_text_cleaned]), maxlen=100, padding="post")
        prediction = np.argmax(best_model.predict(sequence), axis=1)[0]
    else:
        prediction = best_model.predict(vectorizer.transform([review_text_cleaned]))[0]
    sentiment_labels = {1: "Positive", 0: "Negative", 2: "Neutral"}
    return sentiment_labels.get(prediction, "Unknown")

# Obtain raw connection to use cursor functionality
connection = engine.raw_connection()
cursor = connection.cursor()

# Execute a query to fetch flight reviews for inference
cursor.execute("SELECT * FROM Reviews")
inference_reviews_data = cursor.fetchall()

# Convert the fetched reviews into a pandas DataFrame
columns = [desc[0] for desc in cursor.description]  # Get column names
inference_reviews = pd.DataFrame(inference_reviews_data, columns=columns)

# Apply the sentiment prediction function to the reviews
inference_reviews["Predicted_Sentiment"] = inference_reviews["review"].apply(predict_sentiment)


# Print the DataFrame with predicted sentiments
print(inference_reviews)



                                              reviews  predicted_sentiment
0      The boarding process was smooth and efficient.                    1
1         The in-flight WiFi was slow and unreliable.                    0
2   The staff went above and beyond to make sure w...                    1
3           The seats were cramped and uncomfortable.                    0
4                  The flight was quiet and peaceful.                    0
5            The food was tasteless and unappetizing.                    0
6   The cabin crew were very polite and accommodat...                    0
7    I was impressed by the cleanliness of the plane.                    0
8   The flight attendants were rude and unprofessi...                    0
9   The departure and arrival times were accurate ...                    0
10  Exceptional service! The crew was attentive an...                    0
11  The seats were uncomfortable and there was no ...                    0
12          Check-in proc

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# ✅ Text Cleaning Transformer
class TextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.apply(self.clean_text)
    
    @staticmethod
    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

# ✅ Model Selection Logic (Placeholder)
def select_best_model(rf_acc, xgb_acc, dl_acc):
    """Select the best model based on accuracy."""
    if rf_acc >= xgb_acc and rf_acc >= dl_acc:
        return "Random Forest"
    elif xgb_acc >= dl_acc:
        return "XGBoost"
    else:
        return "Deep Learning"

# ✅ TF-IDF + ML Model Pipeline
ml_pipeline = Pipeline([
    ('text_cleaning', TextCleaner()),
    ('vectorizer', TfidfVectorizer(max_features=50000)),
    ('classifier', None)  # Placeholder for RF/XGBoost model
])

# ✅ Tokenization + Deep Learning Pipeline
dl_pipeline = Pipeline([
    ('text_cleaning', TextCleaner()),
    ('tokenizer', None),  # Placeholder for tokenizer
    ('pad_sequences', None),  # Placeholder for sequence padding
    ('classifier', None)  # Placeholder for DL model
])

# ✅ Sentiment Prediction Logic
def predict_sentiment(review_text, best_model_name):
    """Predict sentiment based on the selected best model."""
    review_text_cleaned = TextCleaner().clean_text(review_text)
    
    if best_model_name == "Deep Learning":
        sequence = pad_sequences(None, maxlen=100, padding="post")  # Placeholder for tokenizer
        prediction = np.argmax(None.predict(sequence), axis=1)[0]  # Placeholder for DL model
    else:
        prediction = None.predict(None.transform([review_text_cleaned]))[0]  # Placeholder for ML model

    sentiment_labels = {1: "Positive", 0: "Negative", 2: "Neutral"}
    return sentiment_labels.get(prediction, "Unknown")


# 1️⃣ Fetch Reviews from Database
# 2️⃣ Preprocess Text (Cleaning)
# 3️⃣ Train TF-IDF + ML Models (RF/XGBoost)
# 4️⃣ Train Tokenizer + Deep Learning Model
# 5️⃣ Evaluate Models & Select Best One
# 6️⃣ Deploy Best Model in Sentiment Prediction Pipeline
 
