<a href="https://colab.research.google.com/github/venuaravind-14/VAC-Value_Added_Course-AI-ML/blob/main/Tasks_AIML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task 1: Name of the task: NLP

In [None]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
sentence = "He has affected to render the Military independent of and superior to the Civil power."
print(sentence)

He has affected to render the Military independent of and superior to the Civil power.


In [None]:
(a) Tokenization

In [None]:
tokens = word_tokenize(sentence)
print("Tokens:", tokens)

Tokens: ['He', 'has', 'affected', 'to', 'render', 'the', 'Military', 'independent', 'of', 'and', 'superior', 'to', 'the', 'Civil', 'power', '.']


In [None]:
(b) Stemming

In [None]:
ps = PorterStemmer()
stemmed = [ps.stem(word) for word in tokens]
print("Stemmed Words:", stemmed)

Stemmed Words: ['he', 'ha', 'affect', 'to', 'render', 'the', 'militari', 'independ', 'of', 'and', 'superior', 'to', 'the', 'civil', 'power', '.']


In [None]:
(c) Lemmatization

In [None]:
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
print("Lemmatized Words:", lemmatized)

Lemmatized Words: ['He', 'ha', 'affected', 'to', 'render', 'the', 'Military', 'independent', 'of', 'and', 'superior', 'to', 'the', 'Civil', 'power', '.']


In [None]:
(d) Stopwords Identification

In [None]:
stop_words = set(stopwords.words('english'))
stopword_list = [word for word in tokens if word.lower() in stop_words]

print("Stopwords in Sentence:", stopword_list)

Stopwords in Sentence: ['He', 'has', 'to', 'the', 'of', 'and', 'to', 'the']


In [None]:
Task 2: Vocabulary and Matching

In [None]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

In [None]:
(a) Named Entities

In [None]:
print("Named Entities:")
for ent in doc.ents:
    print(ent.text, "-", ent.label_)

Named Entities:
Civil - ORG


In [None]:
(b) Noun Chunks

In [None]:
print("\nNoun Chunks:")
for chunk in doc.noun_chunks:
    print(chunk.text)


Noun Chunks:
He
the Military independent
the Civil power


In [None]:
(c) Display Dependency Diagram

In [None]:
from spacy import displacy
displacy.render(doc, style="dep", jupyter=True)

In [None]:
(d) Pattern Matching Example

In [None]:
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

pattern = [{"LOWER": "military"}, {"LOWER": "independent"}]
matcher.add("MILITARY_PATTERN", [pattern])

matches = matcher(doc)

print("\nPattern Matches:")
for match_id, start, end in matches:
    span = doc[start:end]
    print("Matched:", span.text)


Pattern Matches:
Matched: Military independent


In [None]:
Task 3: Text Classification

In [None]:
import pandas as pd
import nltk
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
data = pd.read_csv("/content/moviereviews.tsv", sep='\t')

print(data.head())
print(data.columns)

  label                                             review
0   neg  how do films like mouse hunt get into theatres...
1   neg  some talented actresses are blessed with a dem...
2   pos  this has been an extraordinary year for austra...
3   pos  according to hollywood movies made in last few...
4   neg  my first press screening of 1998 and already i...
Index(['label', 'review'], dtype='object')


In [None]:
X = data['review']
y = data['label']

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_cleaned = X.fillna('')
X_vectorized = vectorizer.fit_transform(X_cleaned)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42
)

In [None]:
Model 1: Logistic Regression

In [None]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)

print("\nLogistic Regression Accuracy:", lr_acc)
print(classification_report(y_test, lr_pred))


Logistic Regression Accuracy: 0.8525
              precision    recall  f1-score   support

         neg       0.87      0.81      0.84       191
         pos       0.84      0.89      0.86       209

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



In [None]:
 Model 2: Linear SVM

In [None]:
svm = SVC()
svm.fit(X_train, y_train)

svm_pred = svm.predict(X_test)
svm_acc = accuracy_score(y_test, svm_pred)

print("\nLinear SVM Accuracy:", svm_acc)
print(classification_report(y_test, svm_pred))


Linear SVM Accuracy: 0.8475
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84       191
         pos       0.85      0.87      0.86       209

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



In [None]:
Model 3: Naive Bayes

In [None]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

nb_pred = nb.predict(X_test)
nb_acc = accuracy_score(y_test, nb_pred)

print("\nNaive Bayes Accuracy:", nb_acc)
print(classification_report(y_test, nb_pred))


Naive Bayes Accuracy: 0.81
              precision    recall  f1-score   support

         neg       0.78      0.84      0.81       191
         pos       0.84      0.78      0.81       209

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400



In [None]:
 Final Comparison

In [None]:
print("\n===============================")
print("Final Accuracy Comparison")
print("===============================")

print("Logistic Regression:", lr_acc)
print("Linear SVM:", svm_acc)
print("Naive Bayes:", nb_acc)

best_model = max(
    [("Logistic Regression", lr_acc),
     ("Linear SVM", svm_acc),
     ("Naive Bayes", nb_acc)],
    key=lambda x: x[1]
)

print("\nBest Model:", best_model[0])
print("Highest Accuracy:", best_model[1])


Final Accuracy Comparison
Logistic Regression: 0.8525
Linear SVM: 0.8475
Naive Bayes: 0.81

Best Model: Logistic Regression
Highest Accuracy: 0.8525


In [None]:
Task 4: Sentiment Analysis

In [34]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [35]:
data = pd.read_csv("moviereviews.tsv", sep="\t")
data = data.dropna()

print(data.head())

  label                                             review
0   neg  how do films like mouse hunt get into theatres...
1   neg  some talented actresses are blessed with a dem...
2   pos  this has been an extraordinary year for austra...
3   pos  according to hollywood movies made in last few...
4   neg  my first press screening of 1998 and already i...


In [36]:
sia = SentimentIntensityAnalyzer()

In [39]:
def get_sentiment(text):
    score = sia.polarity_scores(text)

    if score['compound'] >= 0:
        return "pos"
    else:
        return "neg"

data["predicted_sentiment"] = data["review"].apply(get_sentiment)

In [40]:
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(data["label"], data["predicted_sentiment"])

print("VADER Sentiment Analysis Accuracy:", accuracy)
print("\nClassification Report:\n")
print(classification_report(data["label"], data["predicted_sentiment"]))

VADER Sentiment Analysis Accuracy: 0.6335877862595419

Classification Report:

              precision    recall  f1-score   support

         neg       0.72      0.43      0.54       983
         pos       0.60      0.83      0.69       982

    accuracy                           0.63      1965
   macro avg       0.66      0.63      0.62      1965
weighted avg       0.66      0.63      0.62      1965

