In [1]:
# Question 1: SMS spam analysis. NLTK only works once its required data files have been downloaded, so make sure that happens before the rest of this code runs; otherwise it will raise a LookupError.

import pandas as pd
import numpy as np
import re
import nltk
import os
from nltk.corpus import stopwords
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Keep NLTK data in a folder next to the notebook and register it on NLTK's
# search path. The membership check keeps the path list free of duplicates
# when this cell is re-run in the same kernel.
nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_path, exist_ok=True)
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# nltk.data.find() expects a category-qualified resource path (for example
# 'corpora/stopwords'); a bare package id such as 'stopwords' is never found,
# which would force a download attempt on every run.
nltk_resource_paths = {
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'punkt_tab': 'tokenizers/punkt_tab',
}
required_resources = list(nltk_resource_paths)

for resource in required_resources:
    try:
        nltk.data.find(nltk_resource_paths[resource])
    except LookupError:
        # Only hit the network when the resource is genuinely missing locally.
        downloaded = nltk.download(resource, download_dir=nltk_data_path, quiet=True)
        if not downloaded:
            # quiet=True hides the downloader's own messages, so report the
            # failure here; later NLTK calls would otherwise fail without context.
            print(f"Warning: could not download NLTK resource '{resource}'")

# Read only the two useful columns of the SMS dataset, give them readable
# names, and discard rows that have no message text.
df = (
    pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])
    .set_axis(['Label', 'Message'], axis=1)
    .dropna(subset=['Message'])
    .reset_index(drop=True)
)

# English stop words held in a set for fast membership tests while filtering tokens.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Turn a raw message into a list of lowercase, alphabetic, non-stop-word tokens.

    Non-string input and blank/whitespace-only strings yield an empty list.
    Otherwise the text is lowercased, every character that is neither a word
    character nor whitespace is removed, the result is tokenized with
    ``nltk.word_tokenize``, and tokens that are not purely alphabetic or that
    appear in the module-level ``stop_words`` set are dropped.
    """
    if not isinstance(text, str):
        return []
    if not text.strip():
        return []

    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())

    kept = []
    for token in nltk.word_tokenize(without_punctuation):
        if token.isalpha() and token not in stop_words:
            kept.append(token)
    return kept

# Tokenize every message once and store the token lists next to the raw text.
df['tokens'] = df['Message'].map(preprocess_text)

# Pretrained word vectors, obtained through gensim's downloader API.
w2v_model = api.load('word2vec-google-news-300')

def message_to_vector(tokens, model, vector_size=None):
    """Embed a tokenized message as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one message.
    model : object
        Embedding lookup that supports ``word in model.key_to_index`` and
        ``model[word]``. The notebook passes the word vectors loaded via
        gensim's downloader.
    vector_size : int, optional
        Length of the all-zero vector returned when no token is in the
        vocabulary. When omitted, ``model.vector_size`` is used if the model
        exposes it, otherwise 300 (the previous hard-coded default). Deriving
        the size from the model keeps the fallback the same length as real
        embeddings, so the results can always be stacked into one matrix.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all known tokens, or a zero vector
        of length ``vector_size`` when there are none.
    """
    if vector_size is None:
        vector_size = getattr(model, 'vector_size', 300)

    vectors = [model[word] for word in tokens if word in model.key_to_index]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Embed each tokenized message as one fixed-length vector.
df['vector'] = df['tokens'].apply(lambda x: message_to_vector(x, w2v_model))

# Feature matrix (one row per message) and binary target: ham -> 0, spam -> 1.
X = np.stack(df['vector'].values)
y = df['Label'].map({'ham': 0, 'spam': 1}).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The model gets a task-specific name on purpose: predict_message_class() looks
# it up at call time, so a generic name that another cell reuses for its own
# classifier would silently change which model answers spam queries.
spam_clf = LogisticRegression(max_iter=1000)
spam_clf.fit(X_train, y_train)
y_pred = spam_clf.predict(X_test)
print(f"SMS Spam Detection Accuracy: {accuracy_score(y_test, y_pred):.4f}")

def predict_message_class(message):
    """Classify a raw SMS string as ``'spam'`` or ``'ham'``.

    The message goes through the same preprocessing and embedding used for
    training (``preprocess_text`` then ``message_to_vector`` with
    ``w2v_model``) and is scored by the fitted ``spam_clf``. A predicted label
    of 1 is reported as ``'spam'``; anything else as ``'ham'``.
    """
    tokens = preprocess_text(message)
    vector = message_to_vector(tokens, w2v_model)
    return 'spam' if spam_clf.predict([vector])[0] == 1 else 'ham'

print("Example Prediction:", predict_message_class("WIN FREE PRIZE! Text YES to claim"))


SMS Spam Detection Accuracy: 0.9435
Example Prediction: spam


In [2]:
# Question 2: sentiment analysis. Some names used here (nltk, message_to_vector, w2v_model) are defined in Question 1, so run that cell first or this code will raise an error.
import pandas as pd
import numpy as np
import re
import contractions
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the tweets and discard rows that have no tweet text.
df = (
    pd.read_csv('Tweets.csv')
    .dropna(subset=['text'])
    .reset_index(drop=True)
)

# One shared WordNet lemmatizer instance, reused for every tweet.
lemmatizer = WordNetLemmatizer()

def clean_tweet(text):
    """Normalize a raw tweet into a list of lemmatized alphabetic tokens.

    Non-string input and blank/whitespace-only strings yield an empty list.
    Otherwise the tweet is lowercased, contractions are expanded with
    ``contractions.fix``, and the following are stripped in order: URLs,
    @mentions, #hashtags, characters that are neither word characters nor
    whitespace, and non-ASCII characters. The remainder is tokenized with
    ``nltk.word_tokenize``; purely alphabetic tokens are lemmatized with the
    module-level ``lemmatizer`` and returned.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    cleaned = contractions.fix(text.lower())

    # Order matters: URLs, mentions and hashtags must be removed while their
    # marker characters are still present, before generic punctuation stripping.
    removal_patterns = (
        r'http\S+|www\S+|https\S+',
        r'@\w+',
        r'#\w+',
        r'[^\w\s]',
        r'[^\x00-\x7F]+',
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token.isalpha()
    ]

# Tokenize each tweet, then embed it as one fixed-length vector.
df['tokens'] = df['text'].apply(clean_tweet)
df['vector'] = df['tokens'].apply(lambda x: message_to_vector(x, w2v_model))

# Single source of truth for the label encoding; the inverse map decodes
# predictions so the two directions cannot drift apart.
SENTIMENT_TO_ID = {'negative': 0, 'neutral': 1, 'positive': 2}
ID_TO_SENTIMENT = {label_id: name for name, label_id in SENTIMENT_TO_ID.items()}

X = np.stack(df['vector'].values)
y = df['airline_sentiment'].map(SENTIMENT_TO_ID).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Task-specific model name: reusing a generic name such as `clf` would rebind
# the classifier that helper functions defined in other cells look up at call
# time, silently changing their predictions.
# `multi_class='multinomial'` is intentionally not passed: it is deprecated in
# recent scikit-learn releases, and the lbfgs solver already fits a multinomial
# model when the target has more than two classes.
sentiment_clf = LogisticRegression(max_iter=1000, solver='lbfgs')
sentiment_clf.fit(X_train, y_train)

y_pred = sentiment_clf.predict(X_test)
print(f"Twitter Sentiment Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

def predict_tweet_sentiment(tweet):
    """Classify a raw tweet as ``'negative'``, ``'neutral'`` or ``'positive'``.

    The tweet is cleaned with ``clean_tweet``, embedded with
    ``message_to_vector`` using ``w2v_model``, scored by the fitted
    ``sentiment_clf``, and the predicted class id is decoded through
    ``ID_TO_SENTIMENT``.
    """
    tokens = clean_tweet(tweet)
    vector = message_to_vector(tokens, w2v_model)
    pred = sentiment_clf.predict([vector])[0]
    return ID_TO_SENTIMENT[pred]

print("Test Predictions:")
print("1.", predict_tweet_sentiment("Flight delayed 3 hours. Terrible service!"))
print("2.", predict_tweet_sentiment("Average experience, nothing special"))
print("3.", predict_tweet_sentiment("Loved the onboard service! Crew was amazing"))




Twitter Sentiment Accuracy: 0.7848
              precision    recall  f1-score   support

           0       0.82      0.94      0.87      1889
           1       0.62      0.43      0.51       580
           2       0.78      0.60      0.68       459

    accuracy                           0.78      2928
   macro avg       0.74      0.66      0.69      2928
weighted avg       0.77      0.78      0.77      2928

Test Predictions:
1. negative
2. negative
3. positive
