<a href="https://colab.research.google.com/github/thesparshpandya/NLP-College-/blob/main/LCAs/NLPLCA1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#SMS Spam Classifier using Decision Trees and Naive Bayes
#Step 1: Load the data into the environment

import numpy as np
import pandas as pd
import io
from google.colab import files

# Ask for the CSV through Colab's upload widget rather than hardcoding a path.
print("Please upload your CSV file now...")
uploaded = files.upload()

# Only the first uploaded entry is read; any further uploads are ignored.
filename = next(iter(uploaded))

# `uploaded[filename]` is wrapped in BytesIO, so it is presumably the raw
# file content as bytes — TODO confirm against the Colab `files` API.
csv_buffer = io.BytesIO(uploaded[filename])
sms_data = pd.read_csv(csv_buffer, encoding='latin-1')
print("Data Sample:")
print(sms_data.head())

# Keep just the two leading columns; the remaining ones are discarded.
cols = sms_data.columns[:2]
data = sms_data.loc[:, cols]

print(f"\nShape: {data.shape}")

# Give the two kept columns descriptive names.
column_names = {"v1": "Value", "v2": "Text"}
data = data.rename(columns=column_names)

print("\nRenamed Data Sample:")
print(data.head())
print("\nValue Counts:")
print(data["Value"].value_counts())

Please upload your CSV file now...


Saving spam.csv to spam.csv
Data Sample:
     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  

Shape: (5572, 2)

Renamed Data Sample:
  Value                                               Text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he

In [None]:
import nltk
# Fetch the NLTK "words" corpus; the feature-engineering cell reads it through
# nltk.corpus.words to build its English vocabulary.
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
import nltk
# Fetch the "punkt_tab" tokenizer data — presumably what word_tokenize needs in
# the feature-engineering cell; TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
#Step 2: Feature Engineering

import re
import nltk
from nltk import word_tokenize

# The reference vocabulary is built a single time at module level so that the
# per-message helper never has to rebuild it.
# Requires the NLTK words corpus: nltk.download('words')
ENGLISH_VOCAB = {entry.lower() for entry in nltk.corpus.words.words()}

# Patterns are compiled once and shared by every row-wise feature below.
# Any single character that is neither a word character nor whitespace.
PUNCT_RE = re.compile(r"[^\w\s]")
# Ten consecutive digits (also matches inside longer digit runs).
PHONE_RE = re.compile(r"[0-9]{10}")
# "http://" or "https://" followed by hyphen/word/dot characters or %XX escapes.
LINK_RE = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")

def find_unusual_words(text, vocab=None):
    """Count the distinct alphabetic tokens that are missing from a vocabulary.

    Args:
        text: Iterable of already-tokenized strings.
        vocab: Optional collection of known lowercase words. When omitted, the
            module-level ENGLISH_VOCAB is used, which is the original behavior.

    Returns:
        Number of unique lowercased tokens made only of letters that do not
        appear in the vocabulary. Tokens containing digits or punctuation are
        ignored, and repeated tokens are counted once.
    """
    if vocab is None:
        vocab = ENGLISH_VOCAB
    # Lowercasing makes the comparison case-insensitive; the set removes repeats.
    text_vocab_set = {w.lower() for w in text if w.isalpha()}
    # difference() accepts any iterable, so callers may pass a list or a set.
    return len(text_vocab_set.difference(vocab))

# All handcrafted features are derived row by row from the message text.
texts = data["Text"]

# 1. How many punctuation characters the message contains.
data["Punctuations"] = texts.map(lambda msg: len(PUNCT_RE.findall(msg)))

# 2. How many ten-digit runs (phone-number candidates) the message contains.
data["Phonenumbers"] = texts.map(lambda msg: len(PHONE_RE.findall(msg)))

# 3. Binary flag: 1 when the message contains an http(s) link, otherwise 0.
data["Links"] = texts.map(lambda msg: int(LINK_RE.search(msg) is not None))

# 4. How many whitespace-separated words are written fully in uppercase.
data["Uppercase"] = texts.map(
    lambda msg: sum(token.isupper() for token in msg.split())
)

# 5. How many distinct tokens are missing from the shared ENGLISH_VOCAB.
data["unusualwords"] = texts.map(
    lambda msg: find_unusual_words(word_tokenize(msg))
)

print(data[14:25])

   Value                                               Text  Punctuations  \
14   ham                I HAVE A DATE ON SUNDAY WITH WILL!!             2   
15  spam  XXXMobileMovieClub: To use your credit, click ...            11   
16   ham                         Oh k...i'm watching here:)             6   
17   ham  Eh u remember how 2 spell his name... Yes i di...             5   
18   ham  Fine if thatåÕs the way u feel. ThatåÕs the wa...             1   
19  spam  England v Macedonia - dont miss the goals/team...             8   
20   ham          Is that seriously how you spell his name?             1   
21   ham  IÛ÷m going to try for 2 months ha ha only joking             2   
22   ham  So Ì_ pay first lar... Then when is da stock c...             6   
23   ham  Aft i finish my lunch then i go str down lor. ...             3   
24   ham  Ffffffffff. Alright no way I can meet up with ...             2   

    Phonenumbers  Links  Uppercase  unusualwords  
14             0      0 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the raw message text: English stop words are dropped, accents
# are stripped to ASCII, and the vocabulary is capped at 300 terms.
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split happens, so test rows influence the learned vocabulary.
tf_idf = TfidfVectorizer(
    stop_words="english",
    strip_accents='ascii',
    max_features=300,
)
tf_idf_matrix = tf_idf.fit_transform(data["Text"])


In [None]:
# Turn the TF-IDF matrix into a dense frame with one column per vocabulary term.
# The frame reuses data's index explicitly: concat(axis=1) aligns rows on index
# labels, so a default RangeIndex here would misalign (and introduce NaN rows)
# whenever `data` does not itself carry a plain 0..n-1 index.
tfidf_features = pd.DataFrame(
    tf_idf_matrix.toarray(),
    columns=tf_idf.get_feature_names_out(),
    index=data.index,
)
data_extra_features = pd.concat([data, tfidf_features], axis=1)

In [None]:
#Step 3: Machine Learning

from sklearn.model_selection import train_test_split

X = data_extra_features
# Every column except the label and the raw text is used as a model input.
features = X.columns.drop(["Value", "Text"])
target = ["Value"]

# Select the label as a 1-D Series. Indexing with the list `target` would give
# a one-column DataFrame, which makes scikit-learn estimators emit a
# DataConversionWarning ("y = column_or_1d(y, warn=True)") when fitting.
y = X[target[0]]

# A fixed random_state makes the split (and every score computed from it)
# reproducible; stratify keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X[features],
    y,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# min_samples_split=40 stops the tree from splitting nodes with fewer than 40
# samples. random_state is fixed so repeated runs grow the same tree and print
# the same scores for a given split.
dt = DecisionTreeClassifier(min_samples_split=40, random_state=42)
dt.fit(X_train, y_train)
pred = dt.predict(X_test)

# Training accuracy is printed first, then held-out accuracy; the gap between
# the two indicates how much the tree overfits.
print(accuracy_score(y_train, dt.predict(X_train)))
print(accuracy_score(y_test, pred))


0.9868389566882029
0.9676956209619526


In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Flatten the training labels to 1-D before fitting. A single-column DataFrame
# target makes both estimators emit DataConversionWarning
# ("y = column_or_1d(y, warn=True)"); ravel leaves already 1-D labels unchanged.
y_train_1d = np.ravel(y_train)

# Building a Naive Bayes Model
mnb = MultinomialNB()
mnb.fit(X_train, y_train_1d)
pred_mnb = mnb.predict(X_test)
print(accuracy_score(y_test, pred_mnb))

# Building a Logistic Regression Model
lr = LogisticRegression()
lr.fit(X_train, y_train_1d)
pred_lr = lr.predict(X_test)
print(accuracy_score(y_test, pred_lr))


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.964824120603015
0.968413496051687


In [None]:
#Elementary Analysis
#Step 1: Load the modules into the environment

import pandas as pd
import io
from google.colab import files

# Prompt for the review CSV through Colab's upload widget.
print("Please upload your CSV file:")
uploaded = files.upload()

# Only the first uploaded entry is read.
filename = next(iter(uploaded))

# Parse the uploaded content from memory using the Latin-1 encoding.
csv_buffer = io.BytesIO(uploaded[filename])
df = pd.read_csv(csv_buffer, encoding="Latin-1")

print(f"\nSuccessfully loaded '{filename}'!")
print(df.head())

Please upload your CSV file:


Saving brand_reviews.csv to brand_reviews.csv

Successfully loaded 'brand_reviews.csv'!
          Brand                                         TextReview
0  Estee Lauder  This night repair serum is absolutely amazing!...
1      Clinique  I hate the texture of this moisturizer. It is ...
2           MAC  The lipstick color is okay, but it fades way t...
3       Origins  Best face wash I have ever used. Smells great ...
4   Bobbi Brown  The foundation matched my skin tone perfectly,...


In [None]:
# download vader_lexicon using nltk.download()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np


In [None]:
# Source file and the column holding the review text.
fileName = "brand_reviews.csv"
column = "TextReview"

# Read the CSV from the working directory and swap every NaN for a single
# space in one chained expression.
Data = pd.read_csv(fileName).replace(np.nan, ' ', regex=True)

# Plain Python list of the values in the review column, in row order.
review_column = Data[column]
sentences = list(review_column)



In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# The analyzer is created right after the VADER lexicon download.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# One polarity_scores result per sentence, kept in the same order as `sentences`.
sentiments = [sid.polarity_scores(sentence) for sentence in sentences]


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
#Elementary chatbot using NLP
def preprocessing(text):
    """Normalize raw text into a list of lowercase content tokens.

    The text is lowercased and split on whitespace. Punctuation surrounding
    each token is trimmed so that "booking?" and "booking" compare equal, and
    a small fixed set of stop words is removed.

    Args:
        text: The raw string to tokenize.

    Returns:
        List of remaining tokens in their original order. Tokens that consist
        only of punctuation are dropped. Punctuation inside a token (for
        example in a URL) is preserved.
    """
    # Imported locally so the function does not depend on notebook cell order.
    import string

    # A set gives constant-time membership checks.
    stop_words = {'i', 'is', 'a', 'an', 'the', 'to', 'do', 'can', 'how'}

    filtered_tokens = []
    for raw_token in text.lower().split():
        # Strip first, then filter, so "the," is still recognized as "the".
        word = raw_token.strip(string.punctuation)
        if word and word not in stop_words:
            filtered_tokens.append(word)

    return filtered_tokens

# Knowledge base for the bot. Each entry is a two-item list:
#   [question text, [answer text, ...]]
# tellme_bot matches user input against the question and answer text of each
# entry and prints only the first answer of the chosen entry.
pairs = [
    [
        "can i reserve railways booking",
        ["Recently internet reservation facility has started on Indian Railways. The web site http://www.irctc.co.in is operational, wherein you can get the railway reservation done through Credit Cards. For more on Reservation through credit cards click here Internet Reservation"]
    ],
    [
        "another question example",
        ["This is another answer"]
    ]
]

def tellme_bot():
    """Run an interactive question/answer loop on the console.

    Each round reads one line from the user. The line is tokenized with
    `preprocessing` and compared against every entry in `pairs`: an entry's
    score is the number of tokens from its question and answers that also
    occur in the user's input. The first answer of the highest-scoring entry
    is printed; on ties the earliest entry wins. If no entry shares a token
    with the input, a fallback message is printed.

    The loop keeps prompting until the user enters "q" (surrounding
    whitespace and letter case are ignored). Returns None.
    """
    while True:
        response = input("Tell Me. [q to quit]>")
        if response.strip().lower() == 'q':
            break

        # A set makes the per-word membership checks below constant time.
        query_tokens = set(preprocessing(response))
        if not query_tokens:
            print("Please ask a more specific question.")
            continue

        best_index = None
        best_score = 0
        for index, pair in enumerate(pairs):
            question, answers = pair[0], pair[1]
            # Join with a separator; gluing the question directly onto the
            # first answer would fuse the question's last word with the
            # answer's first word and make it unmatchable.
            candidate_text = " ".join([question, *answers])
            score = sum(
                1 for word in preprocessing(candidate_text) if word in query_tokens
            )
            # Strict comparison keeps the earliest entry when scores tie.
            if score > best_score:
                best_index, best_score = index, score

        if best_index is None:
            print("Unable to answer this question")
        else:
            print(pairs[best_index][1][0])
        # No break here: the prompt promises the session lasts until "q".

tellme_bot()

Tell Me. [q to quit]>q


In [None]:
!pip install gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora

# Fetch the NLTK resources for the topic-modelling steps below.
# Read later through stopwords.words('english').
nltk.download('stopwords')
# Presumably the data behind WordNetLemmatizer — TODO confirm.
nltk.download('wordnet')
# NOTE(review): not referenced by name in this cell; presumably a companion
# resource for WordNet — confirm it is still needed.
nltk.download('omw-1.4')

docs1 = "Sugar causes blood glucose to spike and plummet. Unstable blood sugar often leads to mood swings, fatigue, headaches and cravings for more sugar. Cravings set the stage for a cycle of addiction in which every new hit of sugar makes you feel better temporarily but, a few hours later, results in more cravings and hunger. On the flip side, those who avoid sugar often report having little or no cravings for sugary things and feeling emotionally balanced and energized."
docs2 = "Sugar increases the risk of obesity, diabetes and heart disease. Large-scale studies have shown that the more high-glycemic foods (those that quickly affect blood sugar), including foods containing sugar, a person consumes, the higher his risk for becoming obese and for developing diabetes and heart disease1. Emerging research is also suggesting connections between high-glycemic diets and many different forms of cancer."
docs3 = "Sugar interferes with immune function. Research on human subjects is scant, but animal studies have shown that sugar suppresses immune response5. More research is needed to understand the exact mechanisms; however, we do know that bacteria and yeast feed on sugar and that, when these organisms get out of balance in the body, infections and illness are more likely."
docs4 = "A high-sugar diet often results in chromium deficiency. Its sort of a catch-22. If you consume a lot of sugar and other refined carbohydrates, you probably dont get enough of the trace mineral chromium, and one of chromiums main functions is to help regulate blood sugar. Scientists estimate that 90 percent of Americans dont get enough chromium. Chromium is found in a variety of animal foods, seafood and plant foods. Refining starches and other carbohydrates rob these foods of their chromium supplies."
docs5 = "Sugar accelerates aging. It even contributes to that telltale sign of aging: sagging skin. Some of the sugar you consume, after hitting your bloodstream, ends up attaching itself to proteins, in a process called glycation. These new molecular structures contribute to the loss of elasticity found in aging body tissues, from your skin to your organs and arteries7. The more sugar circulating in your blood, the faster this damage takes hold."
docs6 = "Sugar causes tooth decay. With all the other life-threatening effects of sugar, we sometimes forget the most basic damage it does. When it sits on your teeth, it creates decay more efficiently than any other food substance8. For a strong visual reminder, next time the Tooth Fairy visits, try the old tooth-in-a-glass-of-Coke experiment—the results will surely convince you that sugar isnt good for your pearly whites."
docs7 = "Sugar can cause gum disease, which can lead to heart disease. Increasing evidence shows that chronic infections, such as those that result from periodontal problems, play a role in the development of coronary artery disease9. The most popular theory is that the connection is related to widespread effects from the bodys inflammatory response to infection."
docs8 = "Sugar affects behavior and cognition in children. Though it has been confirmed by millions of parents, most researchers have not been able to show the effect of sugar on childrens behavior. A possible problem with the research is that most of it compared the effects of a sugar-sweetened drink to one containing an artificial sweetener10. It may be that kids react to both real sugar and sugar substitutes, therefore showing no differences in behavior. What about kids ability to learn? Between 1979 and 1983, 803 New York City public schools reduced the amount of sucrose (table sugar) and eliminated artificial colors, flavors and two preservatives from school lunches and breakfasts. The diet policy changes were followed by a 15.7 percent increase in a national academic ranking (previously, the greatest improvement ever seen had been 1.7 percent)."
docs9 = "Sugar increases stress. When were under stress, our stress hormone levels rise; these chemicals are the bodys fight-or-flight emergency crew, sent out to prepare the body for an attack or an escape. These chemicals are also called into action when blood sugar is low. For example, after a blood-sugar spike (say, from eating a piece of birthday cake), theres a compensatory dive, which causes the body to release stress hormones such as adrenaline, epinephrine and cortisol. One of the main things these hormones do is raise blood sugar, providing the body with a quick energy boost. The problem is, these helpful hormones can make us feel anxious, irritable and shaky."
docs10 = "Sugar takes the place of important nutrients. According to USDA data, people who consume the most sugar have the lowest intakes of essential nutrients––especially vitamin A, vitamin C, folate, vitamin B-12, calcium, phosphorous, magnesium and iron. Ironically, those who consume the most sugar are children and teenagers, the individuals who need these nutrients most12."
docs11 = "Slashing Sugar. Now that you know the negative impacts refined sugar can have on your body and mind, youll want to be more careful about the foods you choose. And the first step is getting educated about where sugar lurks—believe it or not, a food neednt even taste all that sweet for it to be loaded with sugar. When it comes to convenience and packaged foods, let the ingredients label be your guide, and be aware that just because something boasts that it is low in carbs or a diet food, doesnt mean its free of sugar. Atkins products never contain added sugar."

# The full corpus: one entry per paragraph, in their numbered order.
doc_complete = [
    docs1, docs2, docs3, docs4,
    docs5, docs6, docs7, docs8,
    docs9, docs10, docs11,
]

# Shared cleaning resources, created once and reused by clean_doc.
english_stopwords = stopwords.words('english')
stop_set = set(english_stopwords)
exclude_set = set(string.punctuation)
lemmatize = WordNetLemmatizer()

def clean_doc(doc):
    """Normalize one document for topic modelling.

    The document is lowercased and split on whitespace. For each token, every
    character found in `exclude_set` is removed, tokens listed in `stop_set`
    are dropped, and the remainder is passed through `lemmatize.lemmatize`.

    Stop words are checked both before and after punctuation removal: the
    first check catches entries that contain punctuation themselves (such as
    contractions), the second catches stop words that were glued to
    punctuation in the text (such as "that,").

    Args:
        doc: The raw document string.

    Returns:
        A single string of the surviving tokens joined by one space.
    """
    kept = []
    for raw_token in doc.lower().split():
        if raw_token in stop_set:
            continue
        word = ''.join(ch for ch in raw_token if ch not in exclude_set)
        # Skip tokens that were pure punctuation or turn out to be stop words.
        if not word or word in stop_set:
            continue
        kept.append(lemmatize.lemmatize(word))
    return " ".join(kept)

# Each document becomes a list of cleaned tokens.
cleaned = [clean_doc(doc).split() for doc in doc_complete]

# Map every distinct token to an integer id, then encode each document as a
# bag of words built from those ids.
dictionary = corpora.Dictionary(cleaned)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in cleaned]

Lda = gensim.models.ldamodel.LdaModel
# LDA training is randomized; fixing random_state makes the printed topics
# reproducible from one run to the next.
ldamodel = Lda(
    doc_term_matrix,
    num_topics=5,
    id2word=dictionary,
    passes=300,
    random_state=42,
)

# Show the five highest-weighted words for each of the five topics.
topics = ldamodel.print_topics(num_topics=5, num_words=5)
for topic in topics:
    print(topic)


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


(0, '0.050*"sugar" + 0.021*"craving" + 0.016*"behavior" + 0.011*"new" + 0.011*"effect"')
(1, '0.041*"sugar" + 0.018*"infection" + 0.012*"cause" + 0.012*"research" + 0.012*"result"')
(2, '0.049*"chromium" + 0.026*"food" + 0.018*"get" + 0.018*"carbohydrate" + 0.018*"dont"')
(3, '0.043*"sugar" + 0.024*"body" + 0.020*"hormone" + 0.020*"stress" + 0.015*"blood"')
(4, '0.055*"sugar" + 0.037*"food" + 0.013*"heart" + 0.013*"diet" + 0.013*"diabetes"')
