# Modified TF x IDF

**Goal:** Give a higher priority to keywords:
    
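- TF: takes an additional parameter, a dict mapping each keyword to its score (`keyword_to_score`), e.g. {"prediction": 7, "remain": 2, "stable": 8}
- IDF: compute IDF normally, then subtract the keyword's score (from the dict above) from that keyword's IDF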

[See convo with nova](https://chat.ai.it.ufl.edu/s/7e3502c2-ae0d-481e-925f-b53b13e8c05e)

In [6]:
import os
import sys

# Directory the notebook process is running from (its current working directory).
notebook_dir = os.getcwd()
# Add the parent directory to the system path. This has to run before the
# project import below.
# NOTE(review): presumably `data_processing` lives in that parent directory — confirm.
sys.path.append(os.path.join(notebook_dir, '../'))

# import log_files
from data_processing import DataProcessing

In [2]:
# Sample text scored by the TF-IDF experiments below: numbered example
# sentences listed under a "predictions" heading and a "non-predictions" heading.
# NOTE(review): the "non-predictions" heading line ends abruptly ("in the P");
# it looks truncated — confirm whether that is intentional.
document = """    Some examples of predictions in the PhraseBank dataset are \n
        1. According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing . \n
        2. According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .
        3. Its board of directors will propose a dividend of EUR0 .12 per share for 2010 , up from the EUR0 .08 per share paid in 2009 .
    Some examples of non-predictions in the P
        1. Net sales increased to EUR193 .3 m from EUR179 .9 m and pretax profit rose by 34.2 % to EUR43 .1 m. ( EUR1 = USD1 .4 )
        2. Net sales surged by 18.5 % to EUR167 .8 m. Teleste said that EUR20 .4 m , or 12.2 % , of the sales came from the acquisitions made in 2009 .
        3. STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMENE Credit Suisse First Boston ( CFSB ) raised the fair value for shares in four of the largest Nordic forestry groups ."""

# Loaded through the project helper and interpolated into the query below as
# the definition of a prediction.
prediction_properties = DataProcessing.load_prediction_properties()

# Verbs the query lists as examples of future-tense wording.
future_verbs = [
    "will", "shall", "would", "going", "might",
    "should", "could", "may", "must", "can",
]

# Natural-language query that the sample document is scored against.
query = f"""Can you identify the predictions from the documents? I define a prediction as: {prediction_properties}. Note that the documents should be future tense like {future_verbs}."""

In [3]:
import math
from collections import Counter

def modified_tf_idf(document, query, keyword_scores):
    """Return a single keyword-adjusted TF-IDF score of ``document`` for ``query``.

    Both texts are lower-cased and split on whitespace; no punctuation is
    stripped, so ``"documents?"`` and ``"documents"`` are different terms.

    Args:
        document: Text whose term frequencies are measured.
        query: Text whose terms are scored against the document.
        keyword_scores: Mapping of term -> numeric score. The score is
            subtracted from the term's IDF and also multiplies the term's
            contribution.

    Returns:
        The sum, over every query term (repeated terms are counted each time
        they occur), of ``tf * adjusted_idf * keyword_score``, where the
        keyword score defaults to 1.

    NOTE(review): with a single document the IDF of any term present in the
    document is 0, and the TF of any absent term is 0, so only keywords that
    occur in both texts contribute -- as ``-tf * score**2``. For positive
    scores that is negative; confirm this matches the intended "higher
    priority" for keywords.
    """
    # Tokenize the document and query
    document_terms = document.lower().split()
    query_terms = query.lower().split()

    # Relative frequency of each term in the document
    tf = Counter(document_terms)
    total_terms = len(document_terms)
    tf_normalized = {term: count / total_terms for term, count in tf.items()}

    # IDF for each query term: 0 when the term occurs in the document,
    # log(1 / 1e-6) otherwise (the tiny denominator avoids division by zero).
    # Membership is tested against the Counter (hash lookup) instead of
    # scanning the token list once per query term.
    idf_present = math.log(1 / 1)
    idf_absent = math.log(1 / 0.000001)
    idf = {
        term: idf_present if term in tf else idf_absent
        for term in query_terms
    }

    # Lower the IDF of important keywords by their score
    for term, score in keyword_scores.items():
        if term in idf:
            idf[term] -= score

    # Combine TF, adjusted IDF and keyword weight into one score
    return sum(
        tf_normalized.get(term, 0) * idf.get(term, 0) * keyword_scores.get(term, 1)
        for term in query_terms
    )

# Keyword -> score mapping passed to modified_tf_idf as `keyword_scores`.
# Earlier experiment, kept for reference:
#   document = "This is an example document about predictions."
#   query = "prediction remain stable"
#   score = modified_tf_idf(query, query, keyword_scores)
#   print(f"Modified TF-IDF Score: {score}")
keyword_scores = {
    "prediction": 7,
    "remain": 5,
    "stable": 5,
    "rose": 8,
    "profit": 8,
}

In [4]:
import math
from collections import Counter

def modified_tf_idf(document, query, keyword_scores):
    """Return keyword-adjusted TF-IDF scores of ``document`` per query term.

    Both texts are lower-cased and split on whitespace; no punctuation is
    stripped, so ``"documents?"`` and ``"documents"`` are different terms.

    Args:
        document: Text whose term frequencies are measured.
        query: Text whose terms are scored against the document.
        keyword_scores: Mapping of term -> numeric score. The score is
            subtracted from the term's IDF and also multiplies the term's
            result.

    Returns:
        A dict mapping each distinct query term (in order of first
        appearance) to ``tf * adjusted_idf * keyword_score``, where the
        keyword score defaults to 1.

    NOTE(review): with a single document the IDF of any term present in the
    document is 0, and the TF of any absent term is 0, so only keywords that
    occur in both texts get a non-zero value -- ``-tf * score**2``. For
    positive scores that is negative; confirm this matches the intended
    "higher priority" for keywords.
    """
    # Tokenize the document and query
    document_terms = document.lower().split()
    query_terms = query.lower().split()

    # Relative frequency of each term in the document
    tf = Counter(document_terms)
    total_terms = len(document_terms)
    tf_normalized = {term: count / total_terms for term, count in tf.items()}

    # IDF for each query term: 0 when the term occurs in the document,
    # log(1 / 1e-6) otherwise (the tiny denominator avoids division by zero).
    # Membership is tested against the Counter (hash lookup) instead of
    # scanning the token list once per query term.
    idf_present = math.log(1 / 1)
    idf_absent = math.log(1 / 0.000001)
    idf = {
        term: idf_present if term in tf else idf_absent
        for term in query_terms
    }

    # Lower the IDF of important keywords by their score
    for term, score in keyword_scores.items():
        if term in idf:
            idf[term] -= score

    # Per-term product of TF, adjusted IDF and keyword weight
    return {
        term: tf_normalized.get(term, 0) * idf.get(term, 0) * keyword_scores.get(term, 1)
        for term in query_terms
    }

# Per-term scores for the sample document and query defined in the earlier cells.
tf_idf_scores = modified_tf_idf(
    document=document,
    query=query,
    keyword_scores=keyword_scores,
)
print("Modified TF-IDF Scores:", tf_idf_scores)

Modified TF-IDF Scores: {'can': 0.0, 'you': 0.0, 'identify': 0.0, 'the': 0.0, 'predictions': 0.0, 'from': 0.0, 'documents?': 0.0, 'i': 0.0, 'define': 0.0, 'a': 0.0, 'prediction': 0.0, 'as:': 0.0, '<p>': 0.0, '=': 0.0, '(<p_s>,': 0.0, '<p_t>,': 0.0, '<p_d>,': 0.0, '<p_o>),': 0.0, 'where': 0.0, 'it': 0.0, 'consists': 0.0, 'of': 0.0, 'following': 0.0, 'four': 0.0, 'properties:': 0.0, '1.': 0.0, '<p_s>': 0.0, '-': 0.0, 'defined': 0.0, 'source': 0.0, 'entity': 0.0, 'that': 0.0, 'states': 0.0, 'characteristics:': 0.0, 'person': 0.0, 'with': 0.0, 'either:': 0.0, 'name': 0.0, 'only,': 0.0, 'profile': 0.0, 'geneder': 0.0, 'domain': 0.0, 'specific': 0.0, 'title': 0.0, 'only': 0.0, 'or': 0.0, 'any': 0.0, 'combination': 0.0, 'these.': 0.0, 'an': 0.0, 'associated': 0.0, 'organization': 0.0, 'named': 0.0, 'entity:': 0.0, 'person,': 0.0, 'part': 0.0, 'speech:': 0.0, 'noun': 0.0, 'examples:': 0.0, 'only:': 0.0, 'detravious': 0.0, '2.': 0.0, 'name:': 0.0, 'fittojesus': 0.0, '3.': 0.0, 'gender': 0.0, 'h

In [5]:
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_tf(document):
    """Return the relative frequency of every term in ``document``.

    The text is lower-cased and split on whitespace. Each distinct token is
    mapped to its occurrence count divided by the total number of tokens.
    An empty document yields an empty dict.
    """
    tokens = document.lower().split()
    token_count = len(tokens)

    shares = {}
    for token, occurrences in Counter(tokens).items():
        shares[token] = occurrences / token_count
    return shares

def calculate_idf(documents):
    """Return a smoothed inverse document frequency for every corpus term.

    Each document is lower-cased and split on whitespace. A term's document
    frequency is the number of documents that contain it at least once;
    repeated occurrences inside one document count only once.

    Args:
        documents: Collection of document strings.

    Returns:
        A dict mapping each term (in order of first appearance) to
        ``log(N / (df + 1))``, where ``N`` is the number of documents. The
        ``+ 1`` smoothing makes the value 0 for a term found in ``N - 1``
        documents and negative for a term found in all of them. An empty
        corpus yields an empty dict.
    """
    # Tokenize all documents
    all_terms = [doc.lower().split() for doc in documents]

    # Document frequency: de-duplicate within each document so a term that is
    # repeated in one document is not mistaken for one that occurs in many.
    # dict.fromkeys keeps first-seen order, so the output order is stable.
    df = Counter(term for doc in all_terms for term in dict.fromkeys(doc))
    total_documents = len(all_terms)

    # Smoothed IDF for each term
    return {term: math.log(total_documents / (count + 1)) for term, count in df.items()}

def calculate_tf_idf(document, idf):
    """Return the TF-IDF weight of every term in ``document``.

    Term frequencies come from ``calculate_tf``; each is multiplied by the
    term's entry in ``idf``, with terms missing from ``idf`` treated as
    having an IDF of 0.
    """
    term_frequencies = calculate_tf(document)
    return {
        term: frequency * idf.get(term, 0)
        for term, frequency in term_frequencies.items()
    }

# Example usage: a small corpus used only to derive the IDF weights.
documents = [
    "This is an example document about predictions.",
    "Another document with different terms and predictions.",
    "Yet another document to calculate IDF.",
]

# Corpus-level IDF weights
idf = calculate_idf(documents)

# `document` is not reassigned here (the assignment below stays commented
# out), so the scores are computed for the `document` defined earlier in the
# notebook.
# document = "This is an example document about predictions."
tf_idf_scores = calculate_tf_idf(document, idf)

print("TF-IDF Scores:", tf_idf_scores)

TF-IDF Scores: {'some': 0.0, 'examples': 0.0, 'of': 0.0, 'predictions': 0.0, 'in': 0.0, 'the': 0.0, 'phrasebank': 0.0, 'dataset': 0.0, 'are': 0.0, '1.': 0.0, 'according': 0.0, 'to': 0.013777940566782285, 'gran': 0.0, ',': 0.0, 'company': 0.0, 'has': 0.0, 'no': 0.0, 'plans': 0.0, 'move': 0.0, 'all': 0.0, 'production': 0.0, 'russia': 0.0, 'although': 0.0, 'that': 0.0, 'is': 0.003936554447652081, 'where': 0.0, 'growing': 0.0, '.': 0.0, '2.': 0.0, "'s": 0.0, 'updated': 0.0, 'strategy': 0.0, 'for': 0.0, 'years': 0.0, '2009-2012': 0.0, 'basware': 0.0, 'targets': 0.0, 'a': 0.0, 'long-term': 0.0, 'net': 0.0, 'sales': 0.0, 'growth': 0.0, 'range': 0.0, '20': 0.0, '%': 0.0, '-40': 0.0, 'with': 0.0019682772238260406, 'an': 0.0019682772238260406, 'operating': 0.0, 'profit': 0.0, 'margin': 0.0, '10': 0.0, '-20': 0.0, '3.': 0.0, 'its': 0.0, 'board': 0.0, 'directors': 0.0, 'will': 0.0, 'propose': 0.0, 'dividend': 0.0, 'eur0': 0.0, '.12': 0.0, 'per': 0.0, 'share': 0.0, '2010': 0.0, 'up': 0.0, 'from': 0