In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('Reviews.csv')
# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [2]:
# Check for missing values
df.isnull().sum()

# Drop rows with missing values
df = df.dropna()

# Remove unnecessary columns
df = df[['Text', 'Score']]

# Basic text preprocessing
import re

def preprocess_text(text):
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

df['Text'] = df['Text'].apply(preprocess_text)
print(df.head())


                                                Text  Score
0  i have bought several of the vitality canned d...      5
1  product arrived labeled as jumbo salted peanut...      1
2  this is a confection that has been around a fe...      4
3  if you are looking for the secret ingredient i...      2
4  great taffy at a great price  there was a wide...      5


In [3]:
# Use 10% of the data for testing
df = df.sample(frac=0.1, random_state=42)
print(f"Using {len(df)} samples for this run.")


Using 9999 samples for this run.


In [4]:
# Convert ratings to binary sentiment
def convert_to_sentiment(score):
    if score > 3:
        return 'Positive'
    elif score < 3:
        return 'Negative'
    else:
        return 'Neutral'

df['Sentiment'] = df['Score'].apply(convert_to_sentiment)

# Filter out neutral reviews
df = df[df['Sentiment'] != 'Neutral']
print(df['Sentiment'].value_counts())

Sentiment
Positive    7718
Negative    1473
Name: count, dtype: int64


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Text']).toarray()
y = df['Sentiment'].map({'Positive': 1, 'Negative': 0})

print(X.shape)
print(y.value_counts())


(9191, 5000)
Sentiment
1    7718
0    1473
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(7352, 5000) (1839, 5000)
(7352,) (1839,)


In [7]:
from sklearn.svm import SVC

# Train the model
model = SVC(kernel='linear')

# Debugging: Print before fitting the model
print("Starting model training...")
model.fit(X_train, y_train)

# Debugging: Print after fitting the model
print("Model training completed.")


Starting model training...
Model training completed.


In [8]:
from sklearn.metrics import classification_report, accuracy_score

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9059271343121261
              precision    recall  f1-score   support

           0       0.80      0.50      0.61       276
           1       0.92      0.98      0.95      1563

    accuracy                           0.91      1839
   macro avg       0.86      0.74      0.78      1839
weighted avg       0.90      0.91      0.90      1839

