In [3]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score
from scipy.spatial.distance import cosine

df = pd.read_csv("dataset.txt", delimiter="\t", header=None, names=["text1", "text2", "label"])
df.fillna("", inplace=True)
df['label'] = df['label'].astype(int)

def fast_preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return " ".join(text.split())  

df['processed_text1'] = df['text1'].apply(fast_preprocess)
df['processed_text2'] = df['text2'].apply(fast_preprocess)

vectorizer = HashingVectorizer(n_features=100000, alternate_sign=False)

vec1 = vectorizer.transform(df['processed_text1'])
vec2 = vectorizer.transform(df['processed_text2'])

similarity_scores = np.array([
    1 - cosine(v1.toarray().flatten(), v2.toarray().flatten()) 
    for v1, v2 in zip(vec1, vec2)
])

df['similarity_score'] = similarity_scores

optimal_threshold = np.percentile(similarity_scores, 50)
df['predicted_label'] = (df['similarity_score'] >= optimal_threshold).astype(int)

accuracy = accuracy_score(df['label'], df['predicted_label'])
print(f"Optimized Model Accuracy: {accuracy:.4f}")

text1 = input()
text2 = input()
processed_text1 = fast_preprocess(text1)
processed_text2 = fast_preprocess(text2)

vec_texts = vectorizer.transform([processed_text1, processed_text2])
similarity = 1 - cosine(vec_texts[0].toarray().flatten(), vec_texts[1].toarray().flatten())

print(f"Similarity Score: {similarity:.4f}")
print("Prediction:", "Paraphrased" if similarity >= optimal_threshold else "Not Paraphrased")


  dist = 1.0 - uv / np.sqrt(uu * vv)


Optimized Model Accuracy: 0.6292
Similarity Score: 0.4743
Prediction: Paraphrased
