In [1]:
!pip install nltk spacy gensim pandas scikit-learn #to install python packages using pip



In [2]:
import nltk #imports nltk library for natural language processing in python.
nltk.download('punkt') # Download the Punkt tokenizer
from nltk.tokenize import word_tokenize #this function is used to split sentences into words or tokens

sentence = "Hello, world! This is NLP." #input to tokenize
tokens = word_tokenize(sentence) #split the input sentence into individual tokens
print(tokens) # Output: ['Hello', ',', 'world', '!', 'This', 'is','NLP', '.'] #the list of tokens created by the tokenizer.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Hello', ',', 'world', '!', 'This', 'is', 'NLP', '.']


In [3]:
from nltk.corpus import stopwords #imports stopwords module
nltk.download('stopwords') #download stopwords dataset
stop_words = set(stopwords.words('english')) #create set of english stopwords
filtered_tokens = [word for word in tokens if word.lower() not in
stop_words] # filter tokens that are found in the stopwords #ensure case-insensitive comparison
print(filtered_tokens) #prints list of tokens after stopwords have been filtered

['Hello', ',', 'world', '!', 'NLP', '.']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
from nltk.stem import PorterStemmer #stemming is the process of reducing a word to its base form.
from nltk.stem import WordNetLemmatizer #word is reduced to its meaningful base form
nltk.download('wordnet') #wordnet
ps = PorterStemmer() #creates an instance of the porter stemmer class
lemmatizer = WordNetLemmatizer() # creates an instance of the wordnetlemmatizer class
print(ps.stem("faster")) # Output: faster #because it is already reduced to its base form
print(lemmatizer.lemmatize("sung", pos='v')) # Output: sing (more context needed for lemmatization) #sing is the base form of sung # The accuracy of lemmatization can improve if the part of speech (POS) is provided

[nltk_data] Downloading package wordnet to /root/nltk_data...


faster
sing


In [5]:
import pandas as pd #used for data manipulation and analysis
import nltk #toolkit for nlp
from sklearn.model_selection import train_test_split #split dataset into train and test set
from sklearn.feature_extraction.text import CountVectorizer #converts a collection of text documents into a matrix of token counts
from sklearn.naive_bayes import MultinomialNB #implements multinomial naive bayes algorithm
from sklearn import metrics #evaluates performance

In [7]:
data = {
 'text': [
 'I love this movie!',
 'This was a terrible movie.',
 'I really enjoyed the film.',
 'Worst experience ever.',
 'It was fantastic!',
 'Not worth the time.',
 'Absolutely amazing!',
 'It was okay, not great.',
 'I hate this film.',
 'Best movie ever!'
 ],
 'sentiment': [
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'neutral',
 'negative',
 'positive'
 ]
} #input data

In [8]:
df = pd.DataFrame(data)  #convert into tabular structure with rows and columns

In [9]:
print(df) #prints the data frame

                         text sentiment
0          I love this movie!  negative
1  This was a terrible movie.  positive
2  I really enjoyed the film.  negative
3      Worst experience ever.  positive
4           It was fantastic!  negative
5         Not worth the time.  positive
6         Absolutely amazing!  negative
7     It was okay, not great.   neutral
8           I hate this film.  negative
9            Best movie ever!  positive


In [10]:
X = df['text'] #contains input text data which is used for training and testing
y = df['sentiment'] #contains the target labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42) #splits the data into training(X_train(input), y_train(target_label)) and testing(X_test(input), y_test(target_label)) #20% is used for testing and remaining for testing
# Vectorize the text
vectorizer = CountVectorizer() # convert text data into token counts
X_train_vectorized = vectorizer.fit_transform(X_train) #transforms the training data into a matrix of word counts
X_test_vectorized = vectorizer.transform(X_test) #transforms the testing data into a amtrix of word counts


In [11]:
model = MultinomialNB() #create instance of MultinomialNB class
model.fit(X_train_vectorized, y_train) #trains the model using training data
#during training, model learns the relationship between wordcounts and targets, which is used to make predictions

In [12]:
y_pred = model.predict(X_test_vectorized) #to make predictions on the test set.

In [13]:
accuracy = metrics.accuracy_score(y_test, y_pred) #calculates accuracy of the model predictions
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)#helps us to know how well model is performing
print(f'Accuracy: {accuracy:.2f}') #prints accuracy
print('Confusion Matrix:') #prints heading
print(confusion_matrix) #prints confusion matrix

Accuracy: 0.50
Confusion Matrix:
[[1 0]
 [1 0]]


In [14]:
def predict_sentiment(text): #defines function with single argument
 text_vectorized = vectorizer.transform([text]) #transform train and test data
 prediction = model.predict(text_vectorized) #predict sentiment based on the input
 return prediction[0] #return element from array prediction
# Example usage
new_text = "I loved the plot and the acting!" #example input
print(f'Sentiment: {predict_sentiment(new_text)}') #predicts the sentiment as 'negative' for the given sentence.

Sentiment: negative
