In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two short passages about the same football club, used as a toy corpus.
corpus1 = (
    "Real Madrid Club de Fútbol, commonly referred to as Real Madrid, "
    "is a professional football club based in Madrid, Spain. "
    "Founded on 6 March 1902, Real Madrid is one of the most successful "
    "football clubs in the world."
)

corpus2 = (
    "Real Madrid has won numerous domestic and international titles, "
    "including 34 La Liga titles, 19 Copa del Rey titles, "
    "and 13 UEFA Champions League titles. "
    "The club has a significant rivalry with Barcelona, known as El Clásico."
)

# Fit TF-IDF weights on both passages at once so they share one vocabulary.
vectorizer = TfidfVectorizer()
documents = [corpus1, corpus2]
tfidf_matrix = vectorizer.fit_transform(documents)

# Compare row 0 (corpus1) against row 1 (corpus2); the result is a 1x1 matrix.
first_doc_vector = tfidf_matrix[0:1]
second_doc_vector = tfidf_matrix[1:2]
cosine_sim = cosine_similarity(first_doc_vector, second_doc_vector)
print("Cosine Similarity:", cosine_sim[0][0])


Cosine Similarity: 0.12562441184401718


In [None]:
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of the word sets of two documents.

    Each document is lower-cased and split on whitespace, so punctuation
    stays attached to its word (``"madrid,"`` and ``"madrid"`` are distinct
    tokens).

    Args:
        doc1: First document as a string.
        doc2: Second document as a string.

    Returns:
        ``|A & B| / |A | B|`` as a float in ``[0.0, 1.0]``, where ``A`` and
        ``B`` are the token sets. When both documents contain no tokens
        (empty or whitespace-only), returns ``1.0`` since the two empty sets
        are identical.
    """
    words_doc1 = set(doc1.lower().split())
    words_doc2 = set(doc2.lower().split())
    union = words_doc1 | words_doc2
    if not union:
        # Both token sets are empty: avoid 0/0 and treat them as identical.
        return 1.0
    return len(words_doc1 & words_doc2) / len(union)


# Same two passages as the TF-IDF example, redefined so this cell runs on its own.
corpus1 = (
    "Real Madrid Club de Fútbol, commonly referred to as Real Madrid, "
    "is a professional football club based in Madrid, Spain. "
    "Founded on 6 March 1902, Real Madrid is one of the most successful "
    "football clubs in the world."
)

corpus2 = (
    "Real Madrid has won numerous domestic and international titles, "
    "including 34 La Liga titles, 19 Copa del Rey titles, "
    "and 13 UEFA Champions League titles. "
    "The club has a significant rivalry with Barcelona, known as El Clásico."
)

# Word-set overlap between the two passages.
jaccard_sim = jaccard_similarity(doc1=corpus1, doc2=corpus2)
print("Jaccard Similarity:", jaccard_sim)


Jaccard Similarity: 0.10714285714285714


In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

In [None]:
# Read the tweet sentiment CSV into a DataFrame.
# NOTE(review): the absolute path looks environment-specific (e.g. a hosted
# notebook upload) — confirm or adjust before running elsewhere.
data = pd.read_csv(filepath_or_buffer="/content/Tweets (1).csv")

In [None]:
# Preview the first five rows to check the columns loaded as expected.
data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
# Count missing values per column.
data.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [None]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how="any")

In [None]:
# Encode the sentiment strings as integers: element 0 of the result is the
# array of codes, element 1 is the index of distinct labels.
sentiment_label = data["sentiment"].factorize()
sentiment_label

(array([0, 1, 1, ..., 2, 2, 0]),
 Index(['neutral', 'negative', 'positive'], dtype='object'))

In [None]:
# Raw tweet strings to be tokenized.
tweet = data.text.values

# Keep only the most frequent words when converting texts to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)

# word_index lists every distinct word seen, but texts_to_sequences only emits
# indices below num_words. Size the embedding by the indices that can actually
# occur instead of the full word_index, which would allocate unused rows.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

# Convert each tweet to a list of word indices, then pad/truncate to 200 steps.
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [None]:
embedding_vector_length = 32

# The sentiment column factorizes into several classes (neutral / negative /
# positive), so the output layer needs one unit per class rather than a single
# sigmoid unit.
num_classes = len(sentiment_label[1])

# Embedding -> spatial dropout -> LSTM -> dropout -> softmax classifier.
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

# Integer class labels (0..num_classes-1) pair with the sparse categorical
# loss; binary cross-entropy on labels outside {0, 1} yields a negative,
# meaningless loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 200, 32)           851168    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 200, 32)           0         
 alDropout1D)                                                    
                                                                 
 lstm_8 (LSTM)               (None, 50)                16600     
                                                                 
 dropout_2 (Dropout)         (None, 50)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 51        
                                                                 
Total params: 867819 (3.31 MB)
Trainable params: 867819 (3.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________

In [None]:
# Train for five epochs on the padded sequences and integer sentiment codes,
# reserving 20% of the samples for validation.
history = model.fit(
    x=padded_sequence,
    y=sentiment_label[0],
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
155/687 [=====>........................] - ETA: 2:01 - loss: -17.0792 - accuracy: 0.4196

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Embedding
# texts_to_sequences only emits indices below tokenizer.num_words (when set),
# so the embedding does not need a row for every entry in word_index.
full_vocab_size = len(tokenizer.word_index) + 1
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)
embedding_vector_length = 32
maxlen = 200

# One output unit per distinct sentiment label found by factorize().
num_classes = len(sentiment_label[1])

# Define the RNN model: embedding -> SimpleRNN -> dropout -> softmax classifier
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=maxlen))
model.add(SimpleRNN(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model. The labels are integer class codes, so use the sparse
# categorical loss; binary cross-entropy with a single sigmoid unit cannot
# represent more than two classes.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary (summary() prints directly and returns None).
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 200, 32)           851168    
                                                                 
 simple_rnn (SimpleRNN)      (None, 50)                4150      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_8 (Dense)             (None, 1)                 51        
                                                                 
Total params: 855369 (3.26 MB)
Trainable params: 855369 (3.26 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# Split once, explicitly, so the samples used for the final evaluation are
# exactly the ones withheld from training. The last 20% of the rows are held
# out, matching what validation_split=0.2 would select.
validation_fraction = 0.2
all_labels = sentiment_label[0]
split_at = int(len(padded_sequence) * (1 - validation_fraction))
train_sequences, val_sequences = padded_sequence[:split_at], padded_sequence[split_at:]
train_labels, val_labels = all_labels[:split_at], all_labels[split_at:]

history = model.fit(
    train_sequences,
    train_labels,
    validation_data=(val_sequences, val_labels),
    epochs=5,
    batch_size=32,
)

# Evaluate on the held-out slice only; scoring the full dataset would mostly
# measure performance on samples the model was just trained on.
loss, accuracy = model.evaluate(val_sequences, val_labels)
print('Accuracy:', accuracy)
print('Loss:', loss)

Epoch 1/5
 93/687 [===>..........................] - ETA: 43s - loss: 0.6946 - accuracy: 0.3300

KeyboardInterrupt: 

In [None]:
# Hold out 20% of the tweets for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words features: learn the vocabulary from the training split only,
# then apply that same vocabulary to the test split.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training counts.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_counts, y=y_train)

# Label the held-out tweets and report the fraction predicted correctly.
y_pred = nb_classifier.predict(X_test_counts)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6470160116448326
