In [146]:
import pandas as pd
from google.cloud import bigquery
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os
import json
from pandas import json_normalize
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

In [2]:
def get_raw_news_from_big_query(table='raw_news', project_id='tomastestproject-433206', dataset='testdb_1') -> pd.DataFrame:
    """
    Read every row of one BigQuery table into a pandas DataFrame.

    Args:
        table: Table name inside the dataset.
        project_id: Google Cloud project that owns the dataset.
        dataset: BigQuery dataset that contains the table.

    Returns:
        A DataFrame holding the result of ``SELECT *`` on the table.

    Raises:
        ValueError: If the query returns no rows.

    Side effects:
        Sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable for
        the whole process before the client is created.
    """
    # The client created below is built without explicit credentials, so the
    # key file is announced through the environment first.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'tomastestproject-433206-adc5bc090976.json'

    fully_qualified_table = f"{project_id}.{dataset}.{table}"
    sql = f"""
        SELECT *
        FROM `{fully_qualified_table}`
    """

    # Submit the query, wait for it to finish and materialise the rows.
    rows = bigquery.Client().query(sql).result()
    frame = rows.to_dataframe()

    if not frame.empty:
        return frame
    raise ValueError("No data found")

In [85]:
# Load the raw news rows into `df`, using the function's default table,
# project and dataset.
df = get_raw_news_from_big_query()



In [175]:
def clean_news(data: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten raw news payloads into one row per article.

    Each cell of ``data['data']`` is expected to be a dict with an
    ``'articles'`` list of article dicts; anything else is treated as
    "no articles". The input frame is not modified, so the function can be
    called repeatedly on the same frame.

    Args:
        data: Frame with at least the columns ``'data'`` and ``'company'``.

    Returns:
        A new DataFrame with the flattened article fields plus ``'company'``.
        ``'content'``, ``'source.id'`` and ``'urlToImage'`` are dropped when
        present, ``'publishedAt'`` is parsed to UTC datetimes and renamed to
        ``'pub_date'``, and ``'source.name'`` is renamed to ``'source_name'``.

    Raises:
        ValueError: If no row contains any article.
        KeyError: If the articles have no ``'publishedAt'`` field.
    """
    # Derive the article lists from the *argument* into a new Series, so that
    # neither the caller's frame nor a notebook-global `df` is overwritten.
    article_lists = data['data'].apply(
        lambda payload: payload.get('articles', []) if isinstance(payload, dict) else [])

    # One row per article, carrying the company along.
    exploded = pd.DataFrame(
        {'company': data['company'], 'data': article_lists}).explode('data')

    # explode() leaves a NaN placeholder for empty lists; keep real articles only.
    exploded = exploded[exploded['data'].apply(lambda article: isinstance(article, dict))]
    if exploded.empty:
        raise ValueError("No articles found")

    # Pass a plain list so the result always gets a fresh 0..n-1 index that
    # lines up with the reset 'company' column below.
    articles_df = json_normalize(exploded['data'].tolist())
    final_df = pd.concat(
        [articles_df, exploded[['company']].reset_index(drop=True)], axis=1)

    final_df = final_df.drop(
        columns=['content', 'source.id', 'urlToImage'], errors='ignore')

    final_df['publishedAt'] = pd.to_datetime(
        final_df['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ', utc=True)

    return final_df.rename(columns={"source.name": "source_name",
                                    "publishedAt": "pub_date"})

In [150]:
def _get_sentiment_analyzer():
    """Return one shared SentimentIntensityAnalyzer, building it on first use."""
    analyzer = getattr(_get_sentiment_analyzer, "_analyzer", None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        _get_sentiment_analyzer._analyzer = analyzer
    return analyzer


def make_score(string: str) -> float:
    """
    Predicts sentiment for a string. Returns a float between -1 and 1.

    Args:
        string: Text to score. Missing values (``None``, NaN or any other
            non-string) are accepted.

    Returns:
        The ``'compound'`` polarity score of the text, or ``None`` when the
        input is not a string.
    """
    # Check for missing input first: no analyzer is needed for it, and NaN
    # placeholders would otherwise fail inside polarity_scores.
    if not isinstance(string, str):
        return None
    # Reuse a single analyzer instead of constructing a new one for every row.
    return _get_sentiment_analyzer().polarity_scores(string)['compound']


def predict_sentiment(df: pd.DataFrame):
    """
    Score the 'description' and 'title' text of every row with make_score.

    The scores are written into the passed frame as the new columns
    'score_description' and 'score_title'. The frame is modified in place
    and nothing is returned.

    Note: no aggregation per company or publication date is performed here,
    and no "fetch_date" column is added.
    """
    for text_column in ('description', 'title'):
        df[f'score_{text_column}'] = df[text_column].apply(make_score)

In [176]:
# Fetch the raw rows again and flatten them into one row per article.
df = get_raw_news_from_big_query()
clean_df = clean_news(df)



In [149]:
# Fetch the "vader_lexicon" NLTK data package (reported as already
# up-to-date when it is present locally).
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tomasrydenstam/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [177]:
# Adds the score_description / score_title columns to clean_df in place;
# the function returns nothing, so the cell has no output.
predict_sentiment(clean_df)

In [178]:
# Summarise column dtypes and non-null counts after scoring.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786 entries, 0 to 785
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   author             776 non-null    object             
 1   description        780 non-null    object             
 2   pub_date           786 non-null    datetime64[ns, UTC]
 3   title              786 non-null    object             
 4   url                786 non-null    object             
 5   source_name        786 non-null    object             
 6   company            786 non-null    object             
 7   score_description  780 non-null    float64            
 8   score_title        786 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(2), object(6)
memory usage: 55.4+ KB


In [179]:
from google.cloud import bigquery

# Initialise the BigQuery client from the service-account key file
client = bigquery.Client.from_service_account_json(
    'tomastestproject-433206-adc5bc090976.json'
)

# Destination dataset and table name
table_name = 'clean_news'
table_id = f"tomastestproject-433206.testdb_1.{table_name}"

# Schema matching the columns of the cleaned and scored DataFrame
schema = [
    bigquery.SchemaField("author", "STRING"),
    bigquery.SchemaField("description", "STRING"),
    bigquery.SchemaField("pub_date", "TIMESTAMP"),
    bigquery.SchemaField("title", "STRING"),
    bigquery.SchemaField("url", "STRING"),
    bigquery.SchemaField("source_name", "STRING"),  # renamed from "source.name"
    bigquery.SchemaField("company", "STRING"),
    bigquery.SchemaField("score_description", "FLOAT"),
    bigquery.SchemaField("score_title", "FLOAT"),
]

# Describe the table locally (nothing is sent to BigQuery yet)
table = bigquery.Table(table_id, schema=schema)

# Create the table. exists_ok=True makes the cell re-runnable: without it a
# second run fails because the table is already there.
table = client.create_table(table, exists_ok=True)
print(f"Created table {table_id}")

Created table tomastestproject-433206.testdb_1.clean_news


In [180]:


def write_clean_news_to_bq(data: pd.DataFrame, table='clean_news', project_id='tomastestproject-433206', dataset='testdb_1'):
    """
    Load a DataFrame into a BigQuery table and report the outcome on stdout.

    Args:
        data: Frame to upload.
        table: Destination table name.
        project_id: Google Cloud project that owns the dataset.
        dataset: BigQuery dataset that contains the table.

    Returns:
        None. Either the job's errors or a success message is printed.
    """
    bq_client = bigquery.Client.from_service_account_json(
        'tomastestproject-433206-adc5bc090976.json'
    )
    destination = f"{project_id}.{dataset}.{table}"

    # Start the load job and block until it has finished.
    load_job = bq_client.load_table_from_dataframe(data, destination)
    load_job.result()

    # Report the outcome of the finished job.
    if not load_job.errors:
        print("DataFrame har sparats till BigQuery utan fel.")
        return
    print(f"Errors: {load_job.errors}")

In [170]:
# List the column names of clean_df.
# NOTE(review): this cell's execution count (In [170]) is lower than that of
# the cells that clean and score the frame, so its saved output may predate
# the column renames - re-run to confirm.
clean_df.columns

Index(['author', 'description', 'publishedAt', 'title', 'url', 'source.name',
       'company', 'score_description', 'score_title'],
      dtype='object')

In [181]:
# Upload the cleaned, scored frame to the function's default destination
# (table 'clean_news').
write_clean_news_to_bq(clean_df)

DataFrame har sparats till BigQuery utan fel.


### FORTSÄTT HÄR ###

In [14]:
from google.cloud import bigquery
import pandas as pd  # Import pandas to work with DataFrames

# Initialise the BigQuery client
client = bigquery.Client.from_service_account_json(
    'tomastestproject-433206-adc5bc090976.json')

# SQL query fetching the id and payload of rows not yet marked as processed
query = """ 
SELECT unique_id, data 
FROM `tomastestproject-433206.testdb_1.raw_news_with_uuid`
WHERE is_processed IS FALSE
"""

# Run the query
job = client.query(query)

# Wait for the job to finish and fetch the results as a DataFrame
df = job.to_dataframe()

# Show the first rows of the DataFrame
print(df.head())



                              unique_id  \
0  142a1c6c-4943-452a-8d09-46d3d61007bf   
1  ec82340a-3a50-4e7d-af11-2f78b2878dbc   
2  1983a15a-e23b-481c-98d6-0785ae1358e3   
3  7ec7b81a-a5d6-4ef8-8d8e-fe68b212870c   
4  de325a15-d2ca-4703-b459-7824fb59dada   

                                                data  
0  {'articles': [{'author': 'Trefis Team, Contrib...  
1  {'articles': [{'author': 'Trefis Team, Contrib...  
2  {'articles': [{'author': 'Trefis Team, Contrib...  
3  {'articles': [{'author': 'Trefis Team, Contrib...  
4  {'articles': [{'author': None, 'content': 'Pos...  


In [15]:
# Collect the ids of the fetched rows and render them as a comma-separated
# list of single-quoted SQL string literals.
processed_id_list = df["unique_id"].tolist()
id_str = ', '.join(f"'{row_id}'" for row_id in processed_id_list)

In [17]:
# Konvertera listan till en SQL-kompatibel sträng
# Mark the fetched rows as processed.
# The ids are sent as an array query parameter instead of being spliced into
# the SQL text: this needs no manual quoting and stays valid SQL when the
# list is empty (a string-built `IN ()` would be a syntax error).
query = """
UPDATE `tomastestproject-433206.testdb_1.raw_news_with_uuid`
SET is_processed = TRUE
WHERE unique_id IN UNNEST(@processed_ids);
"""
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter(
            "processed_ids", "STRING", processed_id_list),
    ]
)

# Run the query
job = client.query(query, job_config=job_config)
job.result()  # Wait for the job to finish

print("Tabellen har uppdaterats med is_processed = TRUE för angivna ID:n.")

Tabellen har uppdaterats med is_processed = TRUE för angivna ID:n.
