In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

In [None]:
# Input: the uploaded tab-separated text file.
txt_filename = "plagiarism_data.txt"
# Output: name of the CSV copy that a later cell writes.
csv_filename = "plagiarism_data.csv"

In [None]:
# Parse the tab-separated input file into a list of three-field rows.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
data = []
with open(txt_filename, 'r', encoding='utf-8') as f:
    for line in f:
        # strip() drops the trailing newline (and any surrounding whitespace)
        # before the line is split on tabs.
        parts = line.strip().split('\t')
        # Keep only lines with exactly three fields; every other line is skipped.
        if len(parts) == 3:
            data.append(parts)

In [None]:
# Write the parsed rows to CSV with a header row and no index column.
pair_columns = ["sentence1", "sentence2", "label"]
df = pd.DataFrame(data, columns=pair_columns)
df.to_csv(csv_filename, index=False)
print(f"✅ Converted '{txt_filename}' to '{csv_filename}'")

✅ Converted 'plagiarism_data.txt' to 'plagiarism_data.csv'


In [None]:
# Reload the pairs from the CSV and force the label column to integer dtype.
df = pd.read_csv(csv_filename).astype({'label': int})


In [None]:
def jaccard_similarity(s1, s2):
    """Return the word-level Jaccard similarity of two sentences.

    Each input is lower-cased and split on whitespace; the score is
    ``|words1 & words2| / |words1 | words2|``.

    Args:
        s1: First sentence. Non-string values are converted with ``str()``.
        s2: Second sentence, handled the same way as ``s1``.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when neither input
        contributes any word (empty strings or missing values).
    """
    def _tokens(sentence):
        # Missing values (None or float NaN) contribute no words. Without this
        # guard str() would turn them into the literal tokens "none" / "nan",
        # so two missing sentences would score a perfect 1.0.
        if sentence is None or (isinstance(sentence, float) and sentence != sentence):
            return set()
        return set(str(sentence).lower().split())

    words1 = _tokens(s1)
    words2 = _tokens(s2)
    union = words1 | words2
    if not union:
        return 0.0
    return len(words1 & words2) / len(union)

In [None]:
# Score every sentence pair row by row with jaccard_similarity.
df['jaccard_score'] = df.apply(
    lambda pair: jaccard_similarity(pair['sentence1'], pair['sentence2']),
    axis=1,
)
# Predict 1 when the score reaches the 0.3 threshold, otherwise 0.
df['jaccard_pred'] = df['jaccard_score'].apply(lambda value: int(value >= 0.3))


In [None]:
# Discard rows that are missing either sentence or the label.
required_columns = ["sentence1", "sentence2", "label"]
df = df.dropna(subset=required_columns)

# Fit one TF-IDF vocabulary over both sentence columns. The columns are stacked
# (all of sentence1 first, then all of sentence2), so in the resulting matrix
# row i holds sentence1 of pair i and row i + len(df) holds its sentence2.
all_sentences = pd.concat(
    [df[column] for column in ("sentence1", "sentence2")],
    ignore_index=True,
)
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(all_sentences)

In [None]:
# Build a (score, prediction) tuple per pair. Row k of tfidf_matrix is compared
# with row k + n_pairs, where n_pairs is the current length of df.
# NOTE(review): each iteration makes a separate cosine_similarity call on two
# single-row slices; this may be slow on large frames — consider vectorising.
n_pairs = len(df)
cosine_preds = []
for row_idx in range(n_pairs):
    first_vec = tfidf_matrix[row_idx]
    second_vec = tfidf_matrix[row_idx + n_pairs]
    similarity = cosine_similarity(first_vec, second_vec)[0][0]
    # Predict 1 when the similarity reaches the 0.5 threshold, otherwise 0.
    flag = int(similarity >= 0.5)
    cosine_preds.append((similarity, flag))

In [None]:
# Split the (score, prediction) tuples into two DataFrame columns.
df['cosine_score'] = [entry[0] for entry in cosine_preds]
df['cosine_pred'] = [entry[1] for entry in cosine_preds]

In [None]:
# Report the accuracy of each method's predictions against the label column.
print("\n📊 Accuracy Scores:")
for heading, pred_column in (
    ("✅ Jaccard Accuracy:", 'jaccard_pred'),
    ("✅ Cosine Accuracy:", 'cosine_pred'),
):
    print(heading, accuracy_score(df['label'], df[pred_column]))


📊 Accuracy Scores:
✅ Jaccard Accuracy: 0.5795780264529669
✅ Cosine Accuracy: 0.5975599465387662


In [None]:
# Disable column-width truncation so full sentence text is shown.
pd.set_option('display.max_colwidth', None)

# ✅ Print a reproducible (random_state=42) random sample of 10 result rows.
preview_columns = [
    'sentence1',
    'sentence2',
    'label',
    'jaccard_score',
    'jaccard_pred',
    'cosine_score',
    'cosine_pred',
]
print("\n🔍 Random Sample Prediction Table (10 rows):")
sample_rows = df[preview_columns].sample(n=10, random_state=42)
print(sample_rows)


🔍 Random Sample Prediction Table (10 rows):
                                                                                                               sentence1  \
2113                                                                      A man in a blue shirt walking down the street.   
69090                                                         Several women wearing bright dresses are in a competition.   
4202                                                                                       People are gathered at a bar.   
194635         There is a man with a gray shirt on his speed boat who caught a big fish, and has started to cut it open.   
262438                          girl with pink mohawk sitting in front of computer accompanied by two other individuals.   
91518            a child and a young adult the adult is maybe demonstrating how to do something to teach the young child   
259137                                A businessman holds an umbrella over himself whil

In [None]:
# Save the full results table (scores and predictions) without the index column.
results_path = "plagiarism_analysis_results.csv"
df.to_csv(results_path, index=False)
print("\n📁 Saved detailed results to 'plagiarism_analysis_results.csv'")


📁 Saved detailed results to 'plagiarism_analysis_results.csv'
