In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import string
import pickle

# Download the NLTK resources this script relies on. Besides the stopword
# list, nltk.word_tokenize() (used by preprocess_text) loads the Punkt
# tokenizer models at call time and raises LookupError when they are absent:
# older NLTK releases look for 'punkt', newer ones for 'punkt_tab'.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

# Load dataset
data = pd.read_csv('DataNeuron_Text_Similarity.csv')

# Function to preprocess text
def preprocess_text(text):
    """Normalize raw text before TF-IDF vectorization.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK and filtered against the module-level ``stop_words`` set.

    Args:
        text: The raw string to clean. ``None`` and float NaN (the value
            pandas uses for empty CSV cells) are treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    # NaN is the only float that compares unequal to itself, so this detects
    # missing cells without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

# Clean both text columns in place with the shared preprocessing routine
for column in ('text1', 'text2'):
    data[column] = data[column].apply(preprocess_text)

# Build one document per row by joining its two cleaned texts with a space,
# then fit the TF-IDF vocabulary and weights on those documents
combined_text = data['text1'] + ' ' + data['text2']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_text)

# Cosine similarity of every row document against every other row document
similarity_scores = cosine_similarity(tfidf_matrix)

# Persist the fitted vectorizer and the similarity matrix to disk
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
with open('similarity_scores.pkl', 'wb') as f:
    pickle.dump(similarity_scores, f)

# Read both artifacts straight back, replacing the in-memory objects with
# the unpickled copies
with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
with open('similarity_scores.pkl', 'rb') as f:
    similarity_scores = pickle.load(f)

# Function to predict similarity score
def predict_similarity(text1, text2):
    """Compare a pair of texts against every document in the fitted corpus.

    Both inputs are cleaned with ``preprocess_text``, joined with a single
    space into one document, and transformed with the module-level
    ``vectorizer``. That vector is then compared to each row of the
    module-level ``tfidf_matrix``.

    Args:
        text1: First raw text of the pair.
        text2: Second raw text of the pair.

    Returns:
        A 1-D array of cosine similarities, one entry per row of
        ``tfidf_matrix``.

    NOTE(review): despite the name, this does not return a single score
    measuring how similar ``text1`` is to ``text2``; confirm whether that
    is the intended contract.
    """
    # Same concatenation scheme that was used when the vectorizer was fitted
    cleaned_pair = ' '.join([preprocess_text(text1), preprocess_text(text2)])
    query_vector = vectorizer.transform([cleaned_pair])

    # cosine_similarity yields a (1, n_rows) matrix; keep its only row
    return cosine_similarity(query_vector, tfidf_matrix)[0]

# Example usage: run one pair of sample texts through predict_similarity
text1 = "broadband challenges tv viewing the number of ..."
text2 = "gardener wins double in glasgow britain s jaso..."
similarity_score = predict_similarity(text1, text2)
# NOTE(review): predict_similarity returns an array with one cosine similarity
# per row of tfidf_matrix, so this prints a vector rather than a single number.
print("Similarity score:", similarity_score)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\poppo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similarity score: [0.341886   0.01037452 0.003107   ... 0.         0.01313065 0.        ]
