# Sentiment Analysis using TF IDF Features

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
df = pd.read_csv('test.csv', encoding='latin1')
df=df[['text','sentiment']]
df.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


In [13]:
#Pre-Processing the dataset

stop_words = set(stopwords.words('english'))

def preprocess(text):
    # Step 1: Handle non-string entries
    if not isinstance(text, str):
        text = ''
    print(f"Original: {text}")

    # Step 2: Convert to lowercase
    text = text.lower()
    print(f"Lowercased: {text}")

    # Step 3: Tokenize
    tokens = word_tokenize(text)
    print(f"Tokens: {tokens}")

    # Step 4: Remove stopwords and non-alphabetic tokens
    clean_tokens = []
    for w in tokens:
        if w.isalpha() and w not in stop_words:
            clean_tokens.append(w)
    print(f"Clean Tokens: {clean_tokens}")

    # Step 5: Join tokens back into a string
    clean_text = ' '.join(clean_tokens)
    print(f"Final Clean Text: {clean_text}\n{'-'*40}")

    return clean_text

df['text'] = df['text'].fillna('')

for i in range(5):
    preprocess(df.loc[i, 'text'])

df['clean_text'] = df['text'].apply(preprocess)

Original:  soooooo wish i could, but im in school and myspace is completely blocked
Lowercased:  soooooo wish i could, but im in school and myspace is completely blocked
Tokens: ['soooooo', 'wish', 'i', 'could', ',', 'but', 'im', 'in', 'school', 'and', 'myspace', 'is', 'completely', 'blocked']
Clean Tokens: ['soooooo', 'wish', 'could', 'im', 'school', 'myspace', 'completely', 'blocked']
Final Clean Text: soooooo wish could im school myspace completely blocked
----------------------------------------


'soooooo wish could im school myspace completely blocked'

In [8]:
df

Unnamed: 0,text,sentiment,clean_text
0,Last session of the day http://twitpic.com/67ezh,neutral,last session day http
1,Shanghai is also really exciting (precisely -...,positive,shanghai also really exciting precisely skyscr...
2,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho quit compan...
3,happy bday!,positive,happy bday
4,http://twitpic.com/4w75p - I like it!!,positive,http like
...,...,...,...
4810,,,
4811,,,
4812,,,
4813,,,


In [14]:
def map_sentiment(sentiment):
    if sentiment == 'positive':
        return 1
    else:
        return 0

# Apply the function to create the 'label' column

df['label'] = df['sentiment'].apply(map_sentiment)
df.head()

Unnamed: 0,text,sentiment,clean_text,label
0,Last session of the day http://twitpic.com/67ezh,neutral,last session day http,0
1,Shanghai is also really exciting (precisely -...,positive,shanghai also really exciting precisely skyscr...,1
2,"Recession hit Veronique Branquinho, she has to...",negative,recession hit veronique branquinho quit compan...,0
3,happy bday!,positive,happy bday,1
4,http://twitpic.com/4w75p - I like it!!,positive,http like,1


In [35]:
#Feature extraction by TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Create the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=10)

# Step 2: Fit and transform
X_tfidf = vectorizer.fit_transform(df['clean_text'])

# Step 3: View the feature names (vocabulary)
print("Vocabulary / Features learned by the vectorizer:")
print(vectorizer.get_feature_names_out())
print("-" * 50)

# Step 4: Convert to dense array
X = X_tfidf.toarray()

# Step 5: Show the TF-IDF vectors for the first few rows
print("TF-IDF vectors for the first 3 texts:")
for i in range(1):
    print(f"Original Text: {df.loc[i, 'clean_text']}")
    print(f"TF-IDF Vector: {X[i]}")
    print("-" * 50)

# Step 6: Show shapes of final matrices
y = df['label'].values
print("Final shapes:")
print("TF-IDF feature matrix shape:", X.shape)
print("Labels shape:", y.shape)


Vocabulary / Features learned by the vectorizer:
['day' 'get' 'go' 'going' 'good' 'got' 'http' 'know' 'like' 'love']
--------------------------------------------------
TF-IDF vectors for the first 3 texts:
Original Text: last session day http
TF-IDF Vector: [0.68718758 0.         0.         0.         0.         0.
 0.72648003 0.         0.         0.        ]
--------------------------------------------------
Final shapes:
TF-IDF feature matrix shape: (4815, 10)
Labels shape: (4815,)


In [36]:
class SentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = SentimentDataset(X_train, y_train)
test_dataset = SentimentDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

class SimpleSentimentModel(nn.Module):
    def __init__(self, input_dim):
        super(SimpleSentimentModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x.squeeze()

model = SimpleSentimentModel(input_dim=X.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        preds = (outputs > 0.5).float()
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

print(f"Test Accuracy: {correct / total:.2f}")



Epoch 1, Loss: 120.4839
Epoch 2, Loss: 115.1468
Epoch 3, Loss: 115.1005
Epoch 4, Loss: 114.8764
Epoch 5, Loss: 115.5142
Test Accuracy: 0.81


In [37]:
# 💬 Function to predict sentiment of a new sentence
def predict_sentiment(text, model, vectorizer):
    # Preprocess
    tokens = word_tokenize(text.lower())
    tokens = [w for w in tokens if w.isalpha() and w not in stop_words]
    clean_text = ' '.join(tokens)

    # TF-IDF vectorization
    vec = vectorizer.transform([clean_text]).toarray()
    input_tensor = torch.tensor(vec, dtype=torch.float32)

    # Model prediction
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        pred = (output > 0.5).float().item()

    sentiment = 'Positive' if pred == 1.0 else 'Negative'
    return sentiment

# 📝 Take input from user at runtime
user_input = input("Enter a sentence to analyze sentiment: ")
print(f"Sentiment: {predict_sentiment(user_input, model, vectorizer)}")


KeyboardInterrupt: Interrupted by user