In [None]:
def preprocess_text(text):
    """Normalize raw text into a space-separated string of content words.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized with NLTK, and English
    stop words are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Lowercase first, then strip anything that is not a word character
    # or whitespace (i.e. punctuation and other symbols).
    cleaned = re.sub(r"[^\w\s]", "", text.lower())

    tokens = nltk.word_tokenize(cleaned)

    # A set gives constant-time membership tests while filtering.
    english_stop_words = set(stopwords.words("english"))
    kept_tokens = filter(lambda token: token not in english_stop_words, tokens)

    return " ".join(kept_tokens)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics.pairwise import cosine_similarity

def assess_bbc_relevance(df):
    """Score each article's relevance to the claim "<player> will be the MVP".

    For every row, the article text is fetched with
    ``get_nba_content(row['url'])``; when that returns ``None`` the row's
    ``title`` is used instead. The text and the statement
    ``"<player> will be the MVP"`` are both run through ``preprocess_text``,
    vectorized together with a per-row ``TfidfVectorizer``, and compared by
    cosine similarity. That similarity is divided by the sum of the content's
    TF-IDF weights, and the per-row results are finally rescaled across the
    whole frame with ``MaxAbsScaler``.

    Args:
        df: DataFrame with ``url`` and ``player`` columns, plus ``title`` for
            rows whose content cannot be fetched.

    Returns:
        The same DataFrame object with a ``relevance_score`` column added
        (the input is modified in place). Rows whose content yields no
        TF-IDF terms score 0.0; an empty frame gets an empty column.
    """
    relevance_scores = []

    for _, row in df.iterrows():
        content = get_nba_content(row['url'])
        player_name = row['player']

        # Fall back to the headline when the article body is unavailable.
        if content is None:
            content = row['title']

        statement = player_name + " will be the MVP"

        preprocessed_content = preprocess_text(content)
        preprocessed_statement = preprocess_text(statement)

        # Row 0 is the content, row 1 is the statement.
        tfidf_matrix = TfidfVectorizer().fit_transform(
            [preprocessed_content, preprocessed_statement]
        )
        similarity_score = cosine_similarity(tfidf_matrix)[0, 1]

        # Dampen the similarity by the total TF-IDF weight of the content.
        # Content with no usable terms has zero weight; score it 0.0 instead
        # of letting 0/0 put NaN into the results.
        content_weight = tfidf_matrix[0].sum()
        if content_weight > 0:
            relevance_scores.append(similarity_score / content_weight)
        else:
            relevance_scores.append(0.0)

    # MaxAbsScaler refuses to fit on zero samples, so handle an empty frame
    # explicitly rather than raising.
    if not relevance_scores:
        df['relevance_score'] = np.array([], dtype=float)
        return df

    # Rescale so the highest-scoring row gets 1.0 (scores are non-negative).
    scaler = MaxAbsScaler()
    normalized_scores = scaler.fit_transform(
        np.array(relevance_scores, dtype=float).reshape(-1, 1)
    )
    # Flatten the (n, 1) scaler output to a 1-D column.
    df['relevance_score'] = normalized_scores.ravel()

    return df
