In [22]:
import pandas as pd

# Load the training reviews from JSON and show the frame as the cell output.
train_df = pd.read_json(path_or_buf='train.json')
train_df

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1
...,...,...
7396,"I bought these shirts (black, medium) to wear ...",0
7397,"At first, I thought this scarf might not be th...",1
7398,I am very picky when it comes to bras. I want...,1
7399,"This jacket is wind and water resistant, but n...",0


In [23]:
# Load the test reviews from JSON and preview the first five rows.
test_df = pd.read_json(path_or_buf='test.json')
test_df.head(n=5)

Unnamed: 0,reviews
0,I bought 2 sleepers. sleeper had holes in the...
1,I dare say these are just about the sexiest th...
2,"everything about the transaction (price, deliv..."
3,"Not bad for just a shirt. Very durable, and m..."
4,These are truly wrinkle free and longer than t...


In [19]:
# cell 1
import json
import pandas as pd
import numpy as np
import re
import os

# NLP tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure the NLTK data packages used by this notebook are available.
# 'stopwords' and 'wordnet' back the `stopwords` corpus and the
# `WordNetLemmatizer` imported above.
nltk.download('stopwords')
nltk.download('wordnet')
# NOTE(review): 'omw-1.4' is presumably required by the WordNet lemmatizer on
# some NLTK versions — confirm. Being the last expression in the cell, this
# call's return value is what the cell displays.
nltk.download('omw-1.4')  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jroot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jroot\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jroot\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
# Shared lemmatizer instance, reused for every token in preprocess_text.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set so membership tests are hash lookups.
stop_words = {word for word in stopwords.words('english')}


In [21]:
def preprocess_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, delete every character that is not ``a-z`` or
    whitespace, split on whitespace, drop tokens present in the module-level
    ``stop_words`` set, and lemmatize the remainder with the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. ``None`` or ``NaN`` from a
        missing review) is treated as an empty review instead of raising
        ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` when no token
        survives or the input is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Raw string literal: '\s' inside a plain string is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
    # NOTE: punctuation is deleted rather than replaced by a space, so
    # "in-law" becomes "inlaw" and "I've" becomes "ive"; such merged tokens are
    # then no longer matched by the stop-word filter below.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    return " ".join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )



In [24]:
# Add a 'cleaned_reviews' column to both splits using the same preprocessing.
for reviews_frame in (train_df, test_df):
    reviews_frame['cleaned_reviews'] = reviews_frame['reviews'].apply(preprocess_text)

In [25]:
# Preview the first five training rows, including the new cleaned column.
train_df.head(n=5)

Unnamed: 0,reviews,sentiments,cleaned_reviews
0,I bought this belt for my daughter in-law for ...,1,bought belt daughter inlaw christmas loved
1,The size was perfect and so was the color. It...,1,size perfect color looked like web page
2,"Fits and feels good, esp. for doing a swim rac...",1,fit feel good esp swim race highly recommend c...
3,These socks are absolutely the best. I take pi...,1,sock absolutely best take pilate class hot foo...
4,Thank you so much for the speedy delivery they...,1,thank much speedy delivery came time rehearsal...


In [26]:
# Preview the first five test rows, including the new cleaned column.
test_df.head(n=5)

Unnamed: 0,reviews,cleaned_reviews
0,I bought 2 sleepers. sleeper had holes in the...,bought sleeper sleeper hole arm pit area sleep...
1,I dare say these are just about the sexiest th...,dare say sexiest thing ive ever worn oh ive gs...
2,"everything about the transaction (price, deliv...",everything transaction price delivery time qua...
3,"Not bad for just a shirt. Very durable, and m...",bad shirt durable matched team color perfectly...
4,These are truly wrinkle free and longer than t...,truly wrinkle free longer average woman botton...


In [27]:
# TF-IDF features capped at 5000 terms. The vocabulary is fitted on the
# training reviews only and then reused unchanged for the test reviews; both
# results are converted to dense arrays.
vectorizer = TfidfVectorizer(max_features=5000)
X_train, X_test = (
    vectorizer.fit_transform(train_df['cleaned_reviews']).toarray(),
    vectorizer.transform(test_df['cleaned_reviews']).toarray(),
)

In [30]:

# Persist both feature matrices to .npy files in the working directory.
for npy_path, feature_matrix in (("X_train.npy", X_train), ("X_test.npy", X_test)):
    np.save(npy_path, feature_matrix)


In [36]:
# Report the container type of each feature matrix first.
print(type(X_train), type(X_test))

# Preview the first five rows of each matrix. Anything exposing .toarray()
# is converted with it so the values themselves are printed.
for header, matrix in (
    ("X_train TF-IDF (first 5 rows):", X_train),
    ("X_test TF-IDF (first 5 rows):", X_test),
):
    print(header)
    preview = matrix[:5]
    print(preview.toarray() if hasattr(matrix, "toarray") else preview)

# Optional: show the first entries of the fitted vocabulary.
print("First 20 TF-IDF feature names:")
print(vectorizer.get_feature_names_out()[:20])


<class 'numpy.ndarray'> <class 'numpy.ndarray'>
X_train TF-IDF (first 5 rows):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
X_test TF-IDF (first 5 rows):
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
First 20 TF-IDF feature names:
['aa' 'aaa' 'ab' 'abdomen' 'ability' 'abit' 'able' 'abomination'
 'aboveaverage' 'abrasion' 'absolute' 'absolutely' 'absolutley' 'absorb'
 'absorbent' 'absorber' 'absorbs' 'absorption' 'abuse' 'ac']
