In [1]:
import pandas as pd

In [55]:
# combine csvs
def combine_csvs():
    # read allfiles from the folder
    import glob
    file_list = glob.glob("csvs/*.csv")
    combined_df = pd.concat([pd.read_csv(file) for file in file_list], ignore_index=True)
    return combined_df

df = combine_csvs()
len(df)

5558

In [60]:
# First try ISO format
iso_dates = pd.to_datetime(df['Date'], format='%Y-%m-%d', errors='coerce')

# Then try day-first format
dmy_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

# Combine them: use ISO if available, else DMY
df['Date'] = iso_dates.fillna(dmy_dates).dt.date

# # Sort
df = df.sort_values(by='Date', ascending=False)
df.head()
df

Unnamed: 0,Title,Content,Date
5509,Peterborough hold talks with Williams over man...,Latest from Sky Sports News' Dharmesh Sheth an...,2025-10-26
5510,"Emery: Elliott has to prove, that is why I lef...",Aston Villa boss has said Harvey Elliott needs...,2025-10-26
5511,'West Ham to stick with Nuno',West Ham intend to stick with Nuno and are tar...,2025-10-25
5512,Malacia travels with squad to face Brighton,Tyrell Malacia is part of the match-day squad ...,2025-10-25
5513,Moyes coy on Toney links,David Moyes says he is focused on improving Ev...,2025-10-25
...,...,...,...
5477,Future England No 1? U21 Euros winner Beadle's...,Sky Sports,2025-07-01
5476,Ward returns to Wrexham,Wrexham have announced the signing of Wales go...,2025-07-01
5475,Villa announce signing of Dutch teenager Redmond,Aston Villa have announced the signing of teen...,2025-07-01
5498,Are Liverpool getting closer to Guehi deal?,'Advanced talks' are taking place betwen Liver...,2025-07-01


In [None]:
# def change_date_format(x):
#     # check if x is in format yyyy-mm-dd)
#     if isinstance(x, str) and len(x.split('-')) == 3 and len(x.split('-')[0]) == 4:
#        return pd.to_datetime(x, errors='coerce', dayfirst=False).date()
#     elif isinstance(x, str) and len(x.split('-')) == 3 and len(x.split('/')[2]) == 4:
#        return pd.to_datetime(x, errors='coerce', dayfirst=True).date()
#     else:
#         return pd.to_datetime(x, errors='coerce', dayfirst=True).date()
    
# df['Date'] = df['Date'].apply(change_date_format)
# df = df.sort_values(by='Date', ascending=False)

In [None]:
# 1. Remove empty titles
df.dropna(subset=['Title'], inplace= True)

# 2. Remove empty entries in the content column
df.dropna(subset=['Content'], inplace= True)



In [97]:
import numpy as np
ex_df = np.array([['apple', 'fruit',1], 
                   ['banana', 'fruit',2], 
                   ['apple', 'fruit',3], 
                   ['broccoli', 'vegetable',4], 
                   ['grape', 'fruit',5], 
                   ['grape', 'fruit',6]])
ex_df = pd.DataFrame(ex_df, columns=['Name', 'Type', 'Value'])

In [110]:
ex_df


Unnamed: 0,Name,Type,Value
0,apple,fruit,1
1,banana,fruit,2
2,apple,fruit,3
3,broccoli,vegetable,4
4,grape,fruit,5
5,grape,fruit,6


In [None]:
name_counts = ex_df['Name'].value_counts()
name_counts = name_counts[name_counts > 1].index
ex_df_new =ex_df[ex_df['Name'].isin(name_counts)]
ex_df_new

Unnamed: 0,Name,Type,Value
0,apple,fruit,1
2,apple,fruit,3
4,grape,fruit,5
5,grape,fruit,6


In [117]:
# 3. Remove titles with value count greater than 5
title_counts = df['Title'].value_counts()
filter_titles = title_counts[title_counts < 5].index
df = df[df['Title'].isin(filter_titles)]

# 4. Remove the content count greater than 5
content_counts = df['Content'].value_counts()
filter_content = content_counts[content_counts < 5].index
df = df[df['Content'].isin(filter_content)]

Sky Sports Essential EFL podcast: Championship Predictions

In [None]:
# 4. Remove Title with Noise Keywords
noise_title_words = ['Transfer 360:', 'FREE STREAM:', 'WATCH:','Deadline Day countdown!',
                     'FREE STREAM:', 'Sky Sports Essential EFL podcast:', 'VOTE:', 'LISTEN:',
                     'Sky Sports Fantasy Podcast:', 'Watch ALL', 'Podcast:', 'Next Up:', 'The EFL is back!', 'PL Chief',
                     'Next Up:', 'The EFL is back!', 'Analysis:', 'Listen to the', 'Papers:', 'The final vote!',
                     'Neville:', 'Carra:','Carragher:','Keane:' 'Sign up to']

# 'phrase1|phrase2|phrase3' (which means "OR")
noise_pattern = '|'.join(noise_title_words)
df = df[~df['Title'].str.contains(noise_pattern, case=False)]

# remove the entries when the content length is less than 150
df = df[df['Content'].str.len() >= 150]

In [156]:
df.groupby(['Title', 'Content']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

Unnamed: 0,Title,Content,counts
479,"Arsenal's Madueke move explained, Eze interest...",Arsenal are aiming to sign attacking reinforce...,4
1378,Future England No 1? New Birmingham signing Be...,James Beadle was in goal for Lee Carsley's Eng...,4
1513,Has Frank stamped the Spursy out of Spurs alre...,Thomas Frank is living up to his name as a shr...,4
1585,How the PFA's pre-season programme is helping ...,Sky Sports visits the PFA's pre-season camp in...,4
2419,New forwards with the power to shape this seas...,With Premier League clubs making major upgrade...,4
...,...,...,...
22,'Bayern reach verbal agreement with Jackson',Bayern Munich have reached a verbal agreement ...,1
23,'Bizarre situation unfolding at Forest',"Sky Sports News' Rob Dorsett: ""It's bizarre. I...",1
24,'Bournemouth in advanced talks over Petrovic m...,Bournemouth are in advanced talks to sign Chel...,1
79,'Garnacho sets sights on Chelsea move',Manchester United's Alejandro Garnacho has set...,1


In [175]:
# Remove duplicates rumors
df_unique = df.drop_duplicates(subset=['Title', 'Content'])


In [172]:
df_unique.head()

Unnamed: 0,Title,Content,Date
5509,Peterborough hold talks with Williams over man...,Latest from Sky Sports News' Dharmesh Sheth an...,2025-10-26
5510,"Emery: Elliott has to prove, that is why I lef...",Aston Villa boss has said Harvey Elliott needs...,2025-10-26
5511,'West Ham to stick with Nuno',West Ham intend to stick with Nuno and are tar...,2025-10-25
5512,Malacia travels with squad to face Brighton,Tyrell Malacia is part of the match-day squad ...,2025-10-25
5513,Moyes coy on Toney links,David Moyes says he is focused on improving Ev...,2025-10-25


In [None]:
# To give more context to the model to eliminate the older rumors affects
df_unique.loc[:, 'Content'] = 'Date: ' + df_unique['Date'].astype(str) + ', Content: ' + df_unique['Content']

In [180]:
df_unique.head()

Unnamed: 0,Title,Content,Date
5509,Peterborough hold talks with Williams over man...,"Date: 2025-10-26, Content: Date: 2025-10-26, C...",2025-10-26
5510,"Emery: Elliott has to prove, that is why I lef...","Date: 2025-10-26, Content: Date: 2025-10-26, C...",2025-10-26
5511,'West Ham to stick with Nuno',"Date: 2025-10-25, Content: Date: 2025-10-25, C...",2025-10-25
5512,Malacia travels with squad to face Brighton,"Date: 2025-10-25, Content: Date: 2025-10-25, C...",2025-10-25
5513,Moyes coy on Toney links,"Date: 2025-10-25, Content: Date: 2025-10-25, C...",2025-10-25
