In [1]:
!git clone https://github.com/sv3t1k/First-ML-Project.git
%cd First-ML-Project
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

Cloning into 'First-ML-Project'...
remote: Enumerating objects: 75, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (71/71), done.[K
remote: Total 75 (delta 16), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (75/75), 170.49 KiB | 2.75 MiB/s, done.
Resolving deltas: 100% (16/16), done.
/content/First-ML-Project
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
!pip install -r data/requirements.txt



In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from data.src.loader import DataLoader
from data.src.preprocessor import TextPreprocessor
from data.src.models import ModelTrainer

print("--- STEP 1: Data Loading ---")
loader = DataLoader("data/train.csv")
df = loader.load_and_clean()

print(f"Number of rows loaded: {len(df)}")
print("\nClass Distribution (Point 4):")
print(df['label'].value_counts())

print("\n--- STEP 2: Text Cleaning (Preprocessing) ---")
preprocessor = TextPreprocessor()

df['cleaned_text'] = df['reviews.text'].astype(str).apply(preprocessor.clean_text)

print("Example before:", df['reviews.text'].iloc[0][:100], "...")
print("Example after:", df['cleaned_text'].iloc[0][:100], "...")

print("\n--- STEP 3: Vectorization (TF-IDF) ---")
trainer = ModelTrainer()
X = trainer.vectorize_data(df['cleaned_text'])
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

trainer.compare_models(X_train, X_test, y_train, y_test)

final_model = trainer.fine_tune_logistic(X_train, y_train)

print("\n--- STEP 5: Final Metrics ---")
predictions = final_model.predict(X_test)
print(classification_report(y_test, predictions))

--- STEP 1: Data Loading ---
Number of rows loaded: 1664

Class Distribution (Point 4):
label
1    1587
0      77
Name: count, dtype: int64

--- STEP 2: Text Cleaning (Preprocessing) ---
Example before: This product so far has not disappointed. My children love to use it and I like the ability to monit ...
Example after: product far disappointed child love use like ability monitor control content see ease ...

--- STEP 3: Vectorization (TF-IDF) ---

--- Model Comparison ---
Naive Bayes Accuracy: 0.9550
Logistic Regression Accuracy: 0.9550

--- The final model is trained with class balance ---

--- STEP 5: Final Metrics ---
              precision    recall  f1-score   support

           0       0.38      0.20      0.26        15
           1       0.96      0.98      0.97       318

    accuracy                           0.95       333
   macro avg       0.67      0.59      0.62       333
weighted avg       0.94      0.95      0.94       333



In [17]:
import requests
from bs4 import BeautifulSoup

def analyze_web_page(url, model, vectorizer, preprocessor):
    print(f"\n--- Analyzing page: {url} ---")


    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    web_texts = [p.get_text() for p in soup.find_all('p') if len(p.get_text()) > 20]

    if not web_texts:
        print("Text for analysis not found.")
        return

    cleaned_web_texts = [preprocessor.clean_text(t) for t in web_texts]
    web_vectors = vectorizer.transform(cleaned_web_texts)

    predictions = model.predict(web_vectors)
    probabilities = model.predict_proba(web_vectors)


    for i in range(min(5, len(web_texts))):
        sentiment = "POSITIVE" if predictions[i] == 1 else "NEGATIVE"
        conf = probabilities[i][predictions[i]] * 100
        print(f"\nText: {web_texts[i][:100]}...")
        print(f"Result: {sentiment} ({conf:.2f}% confidence)")

test_url = "https://www.gsmarena.com/vivo_iqoo_15_ultra_5g-reviews-14445.php"
analyze_web_page(test_url, final_model, trainer.vectorizer, preprocessor)


--- Analyzing page: https://www.gsmarena.com/vivo_iqoo_15_ultra_5g-reviews-14445.php ---

Text: Sohail , 30 Jan 2026HzRegarding the display refresh rate, Iâ€™m seeing conflicting rumors. Some early ...
Result: POSITIVE (69.69% confidence)

Text: Md. Faim Roze , 15 hours agoMissing 120fps video record option So SadYou are at the mercy of the sen...
Result: POSITIVE (72.23% confidence)

Text: Missing 120fps video record option So Sad...
Result: POSITIVE (69.95% confidence)

Text: As it is known for gaming phone, but with this model, the brand could for an example, implemented ne...
Result: POSITIVE (70.93% confidence)

Text: Amitangshu, 05 Feb 2026It has LTPO. The vailla Iqoo 15 has a LTPO panelCorrect. And the iQOO 15 Ultr...
Result: POSITIVE (64.57% confidence)
