In [1]:
try:
    from sentence_transformers import SentenceTransformer, util
except:
    !pip install sentence-transformers
try:
    import pandas as pd
except:
    !pip install pandas
try:
    import pytz
except:
    !pip install pytz
    
from datetime import datetime

utc = pytz.UTC # Define timezone-aware start and end dates

In [2]:
# Load pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Compact and efficient

In [3]:
# Load the dataset from a JSON Lines file
data = pd.read_json("signal-1m-vw_volkswagen.jsonl", lines=True)  
data_org = data
data.head()

Unnamed: 0,id,content,title,media-type,source,published
0,79476059-1bf0-4231-b561-6778680c3b7a,Volkswagen Passenger Cars and Volkswagen Comme...,The new Volkswagen Touran,Blog,Latest News on One News Page [United States] -...,2015-07-20T06:40:20Z
1,a109927d-b9ae-43f2-9ba1-3b0d2557d956,Extraordinary Inventions: Victorian-Era Prank ...,Archive: 2012,Blog,Dark Roasted Blend,2015-08-04T02:21:58Z
2,ddd3c909-ee01-46de-aae1-b8b471e97fd4,(0 comments - 370 views) \n#1 Video Ronda Rous...,Weekly Achievements for 02Aug15 thru 08Aug15,Blog,Latest Blog Entries at VideoSift.com,2015-08-09T07:01:02Z
3,fc8ad5c5-b6cc-4d42-a01f-52c9c6f6e47e,(0 comments - 323 views) \n#1 Video Ronda Rous...,Weekly Achievements for 02Aug15 thru 08Aug15,Blog,Latest Blog Entries at VideoSift.com,2015-08-09T07:01:02Z
4,290bd144-bf98-4d3d-bc85-b3a423ca7e8a,(0 comments - 434 views) \n#1 Video Ronda Rous...,Weekly Achievements for 02Aug15 thru 08Aug15,Blog,Latest Blog Entries at VideoSift.com,2015-08-09T07:01:02Z


In [4]:
# Cleaning data
data['content'] = data['content'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
data['title'] = data['title'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
data['source'] = data['source'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
data['published'] = pd.to_datetime(data['published'])

In [5]:
data.head()

Unnamed: 0,id,content,title,media-type,source,published
0,79476059-1bf0-4231-b561-6778680c3b7a,volkswagen passenger cars and volkswagen comme...,the new volkswagen touran,Blog,latest news on one news page united states to...,2015-07-20 06:40:20+00:00
1,a109927d-b9ae-43f2-9ba1-3b0d2557d956,extraordinary inventions victorianera prank ma...,archive 2012,Blog,dark roasted blend,2015-08-04 02:21:58+00:00
2,ddd3c909-ee01-46de-aae1-b8b471e97fd4,0 comments 370 views \n1 video ronda rousey o...,weekly achievements for 02aug15 thru 08aug15,Blog,latest blog entries at videosiftcom,2015-08-09 07:01:02+00:00
3,fc8ad5c5-b6cc-4d42-a01f-52c9c6f6e47e,0 comments 323 views \n1 video ronda rousey o...,weekly achievements for 02aug15 thru 08aug15,Blog,latest blog entries at videosiftcom,2015-08-09 07:01:02+00:00
4,290bd144-bf98-4d3d-bc85-b3a423ca7e8a,0 comments 434 views \n1 video ronda rousey o...,weekly achievements for 02aug15 thru 08aug15,Blog,latest blog entries at videosiftcom,2015-08-09 07:01:02+00:00


In [6]:
# Load SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define reference query
reference_query = "Volkswagen"

# Generate embeddings for articles and the query
data['combined_text'] = data['title'] + ". " + data['content']
article_embeddings = model.encode(data['combined_text'].tolist(), convert_to_tensor=True)  # Ensure input is a list
query_embedding = model.encode(reference_query.lower(), convert_to_tensor=True)

# Compute cosine similarity
data['similarity'] = util.cos_sim(query_embedding, article_embeddings).squeeze().cpu().numpy()

# Filter and sort by relevance
relevant_articles = data[data['similarity'] > 0.5].sort_values(by='similarity', ascending=False)

# Display results
relevant_articles[['id', 'title', 'content', 'published', 'similarity']]

Unnamed: 0,id,title,content,published,similarity
1233,e8910229-4c4c-40b5-9450-ab970959f2cf,volkswagen dealership near manassas virginia i...,karen radley volkswagen offers new 2016 volksw...,2015-09-18 12:36:00+00:00,0.605165
3917,1c7b808b-c61b-4def-a39a-20cc2d77717a,vw cars with suspect software in europe too,geir\tmoulsontheassociatedpress \r\n\r\n\t\tby...,2015-09-24 14:21:00+00:00,0.583258
6105,ad9edf0a-e667-4fd1-ae23-e7b5fa4088e7,volkswagen has started damage control,volkswagen ag continues to be the focal point ...,2015-09-29 14:13:00+00:00,0.582303
6098,becc897d-05ff-4c20-9a18-76a1db2b75c1,vw reaches out to diesel consumers with new we...,dr herbert diess ceo of the volkswagen passeng...,2015-09-29 14:01:00+00:00,0.579154
5940,ea289542-b8cb-4f6e-b2fd-2d0cea16da77,volkswagen 1200,tuesday 29 september 2015\nno need to comment ...,2015-09-29 07:45:46+00:00,0.578764
...,...,...,...,...,...
1667,76b15a52-6e18-4600-a4a6-913d62379a3b,vw shares plunge after car maker admits riggin...,shares in german car maker volkswagen have plu...,2015-09-21 11:50:38+00:00,0.501263
3804,3cf3c9bf-f3c9-4ecc-844a-9d9360812e2a,update 1vw customers dealers seek speedy actio...,trust in cars intact not in company dealer a...,2015-09-24 11:33:58+00:00,0.500833
2956,473adecc-d178-4d42-bc82-2d887eac2341,volkswagen shares are going nuts,volkswagen shares are getting hammered again o...,2015-09-23 07:20:42+00:00,0.500804
6563,59b71021-3955-4a48-aa61-7033ad874d0e,sorry folks volkswagen wont die,thanks to the magic of the market economy the ...,2015-09-30 15:41:45+00:00,0.500580


In [7]:
# Save relevant articles to a CSV file
output_csv_path = "relevant_articles.csv"

# Use the original 'title' and 'content' columns before cleaning
relevant_articles[['id', 'title', 'content', 'published', 'similarity']].to_csv(output_csv_path, index=False)

# Print original titles, content, and dates in sequence
for _, row in relevant_articles.sort_values(by='published', ascending=True).iterrows():
    print(f"Date: {row['published']}")
    print(f"Title: {data_org.loc[row.name, 'title']}")  # Access the original title
    print(f"Content: {data_org.loc[row.name, 'content']}")  # Access the original content
    print("-" * 80)

print(f"Relevant articles have been saved to {output_csv_path}.")


Date: 2015-07-20 06:40:20+00:00
Title: the new volkswagen touran
Content: volkswagen passenger cars and volkswagen commercial vehicles together offer one of the most diverse and attractive ranges of mpvs anywhere in the world the passenger car brand from wolfsburg has the golf sportsvan the touran and the sharan while volkswagen commercial vehicles in hanover 
--------------------------------------------------------------------------------
Date: 2015-09-01 16:28:27+00:00
Title: volkswagen sales climb as school year commences smart new 2016 jetta receives safety accolades
Content: ajax ontario  marketwired  090115  as school bells begin to ring out sales at volkswagen canada continue to set a torrid pace during august 6826 new cars and light trucks were sold by the company making it the best such month of all time yeartodate sales of 50418 are also well ahead of last years total 44786 

as we transition to the new model year we will introduce a whole slew of new technologies in our prod