# In [1]:
import numpy as np
import pandas as pd

# In [None]:
import json

import pandas as pd
from scipy.spatial.distance import jaccard, correlation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Read the JSON Lines file: one JSON document per line.
# - The encoding is pinned to UTF-8 so the result does not depend on the
#   platform's default text encoding.
# - Blank lines (e.g. a trailing newline at the end of the file) are skipped;
#   json.loads() on an empty line would otherwise raise JSONDecodeError.
with open('mlq3.json', 'r', encoding='utf-8') as file:
    json_data = [json.loads(line) for line in file if line.strip()]

# Create DataFrame from the JSON data (one row per parsed line)
df = pd.DataFrame(json_data)

# Display the first rows as a quick sanity check
print(df.head())

# Query text that every row of `df` is compared against
given_data = 'This is a sample data point'

# Concatenate 'headline' and 'short_description' columns into a single text column.
# Missing values are replaced with '' so a NaN in either column does not turn
# the combined text into NaN (which the vectorizer cannot process).
df['text'] = df['headline'].fillna('') + ' ' + df['short_description'].fillna('')

# Fit TF-IDF on the corpus and project the query into the same vocabulary.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])

# The query vector does not depend on the row being compared: transform it
# once and reuse it for every metric instead of re-transforming per row.
query_vector = tfidf.transform([given_data])
query_dense = query_vector.toarray().ravel()

# Calculate cosine similarity (higher = more similar)
cosine_similarities = cosine_similarity(tfidf_matrix, query_vector).flatten()

# Calculate Euclidean distance (lower = more similar).
# NOTE: the result is stored under its own name so that the imported
# euclidean_distances() function is not overwritten; rebinding it made the
# cell fail on a second run.
euclidean_dists = euclidean_distances(tfidf_matrix, query_vector).flatten()

# Convert text data to binary (term present / absent) vectors.
# The query is binarised as well: Jaccard compares term *sets*, so comparing
# 0/1 document vectors against TF-IDF-weighted query values would count
# shared terms as mismatches.
binary_vectors = tfidf_matrix.sign()
query_binary = query_dense != 0

n_docs = tfidf_matrix.shape[0]

# Calculate Jaccard similarity (1 - Jaccard distance) on boolean vectors
jaccard_similarities = [
    1 - jaccard(binary_vectors[i].toarray().ravel() != 0, query_binary)
    for i in range(n_docs)
]

# Calculate Pearson correlation coefficient (1 - correlation distance).
# NOTE(review): rows with zero variance (e.g. an all-zero vector) are expected
# to yield NaN here — confirm whether such rows need special handling.
pearson_correlations = [
    1 - correlation(tfidf_matrix[i].toarray().ravel(), query_dense)
    for i in range(n_docs)
]

# Create a DataFrame to store the similarity scores.
# It shares its index with `df` (via the 'Category' Series), which is what
# allows mapping the best score back to the original row below.
similarity_scores = pd.DataFrame({
    'Category': df['category'],
    'Cosine Similarity': cosine_similarities,
    'Euclidean Distance': euclidean_dists,
    'Jaccard Similarity': jaccard_similarities,
    'Pearson Correlation': pearson_correlations
})

# Sort so the best match comes first: similarities descending, but the
# Euclidean *distance* ascending (a smaller distance means a closer match).
similarity_scores = similarity_scores.sort_values(
    by=['Cosine Similarity', 'Euclidean Distance', 'Jaccard Similarity', 'Pearson Correlation'],
    ascending=[False, True, False, False],
)

# Print the most similar data point
most_similar_data = similarity_scores.iloc[0]
print('\nMost Similar Data:')
print(most_similar_data)

# Access specific information.
# The row is looked up by the index label of the best-scoring entry, so the
# printed text is the actual best match rather than merely the first row that
# happens to share its category.
most_similar_category = most_similar_data['Category']
most_similar_data_point = df.loc[most_similar_data.name, 'text']
print('\nMost Similar Category:', most_similar_category)
print('Most Similar Data Point:', most_similar_data_point)
