# Install and Import dependencies

In [1]:
from transformers import pipeline
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

  from .autonotebook import tqdm as notebook_tqdm


# Instantiate Model

In [2]:
from pattern.en import sentiment

# Load data d1-1

In [3]:
# Load the CSV file into a DataFrame
df_en_1 = pd.read_csv('https://raw.githubusercontent.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/main/Preprocessing/data_clean/en_clean_1-1.csv')

## Extracting the sentences

In [4]:
def extract_sentence(df):
    df['short'] = ''
    
    for index, row in df.iterrows():
        player = row['player']
        data = str(row['data'])  # Convert data to string
        
        # Find the sentence containing player's information
        sentences = data.split('.')
        for sentence in sentences:
            if player.lower() in sentence.lower():
                df.at[index, 'short'] = sentence.strip()
                break
    
    return df

In [5]:
extract_sentence(df_en_1)

Unnamed: 0,data,player,language,publishedAt,short
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,ten if you included the toe-poked volley to te...
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,bayerleverkusen took the lead again in the st ...
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,wissam ben yedder levelled straight away from ...
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,"midfielders: leandro paredes (juventus), angel..."
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,midfielders: rodrigo de paul (atletico madrid)...
...,...,...,...,...,...
236,steidten is credited as having played a key ro...,hincapie,en,2023-05-16T12:22:18Z,steidten is credited as having played a key ro...
237,-santiago morales; source: marca / rts via afi...,hincapie,en,2023-04-27T04:57:02Z,-santiago morales; source: marca / rts via afi...
238,and though one middling game won't make his su...,hincapie,en,2023-03-24T15:24:08Z,and though one middling game won't make his su...
239,"palacios tries a through ball, but hincapie is...",hincapie,en,2023-03-19T20:03:28Z,"palacios tries a through ball, but hincapie is..."


# Load data condensed

In [6]:
# Load the CSV file into a DataFrame
df_de_con = pd.read_csv('https://raw.githubusercontent.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/main/Preprocessing/data_clean/en_clean_condensed.csv')

In [7]:
df_de_con

Unnamed: 0,data,player,language,publishedAt
0,football wirtz s goal bayerleverkusen europale...,exequiel palacios,en,2023-02-16T23:56:00Z
1,xasoccer football europaleague play off secon...,exequiel palacios,en,2023-02-23T20:50:50Z
2,bayerleverkusen shakhtar donetsk union berli...,exequiel palacios,en,2023-02-23T20:53:59Z
3,buenos aires ap world cup winner argentina c...,exequiel palacios,en,2023-03-03T16:40:46Z
4,star edition change location copy personal non...,exequiel palacios,en,2023-03-03T16:42:19Z
...,...,...,...,...
389,tottenham identified form bayerleverkusen werd...,piero hincapie,en,2023-05-16T12:22:18Z
390,future course kendry couple weeks shy birthda...,piero hincapie,en,2023-04-27T04:57:02Z
391,man city s alex robertson makes debut aiden o ...,piero hincapie,en,2023-03-24T15:24:08Z
392,palacios scored penalties bayerleverkusen beat...,piero hincapie,en,2023-03-19T20:03:28Z


# Sentiment analysis on d1-1

## Using pipelin with pretrained sentiment analysis model from bert
https://huggingface.co/bert-base-uncased
This was trained on news data

In [8]:
#ACTION look up bert base uncased and formulate better comment above
sentiment_classifier = pipeline('sentiment-analysis', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# Apply sentiment analysis on the 'short' column  and store the sentiment in a new column "sentiment_bert"
df_en_1['sentiment_bert'] = df_en_1['short'].apply(lambda x: sentiment_classifier(x)[0]['score'])

# Print the updated dataframe
df_en_1.head()

Unnamed: 0,data,player,language,publishedAt,short,sentiment_bert
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,ten if you included the toe-poked volley to te...,0.682737
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,bayerleverkusen took the lead again in the st ...,0.671484
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,wissam ben yedder levelled straight away from ...,0.695632
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,"midfielders: leandro paredes (juventus), angel...",0.729994
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,midfielders: rodrigo de paul (atletico madrid)...,0.721409


## Fine-tuning BERT

In [13]:
df_en_lab = pd.read_csv('https://raw.githubusercontent.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/main/Preprocessing/data_clean/labeled-data/labeled-en_clean_1-1_not101010.csv')

In [14]:
df_lab = df_en_lab.copy()
df_lab.head()

Unnamed: 0,data,player,language,publishedAt,Label
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,positiv
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,neutral
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,neutral


In [None]:
train_text, temp_text, train_labels, temp_labels = train_test_split(df_lab['data'], df['Label'])

## Using sentiment form pattern.en
https://stackabuse.com/python-for-nlp-introduction-to-the-pattern-library/

In [26]:
# Function to get sentiment polarity
def get_sentiment(text):
    sentiment_score = sentiment(text)[0]
    return sentiment_score


In [32]:
# Apply sentiment analysis to the "short" column and store the sentiment in a new column "sentiment_pattern"
df_en_1['sentiment_pattern'] = df_en_1['short'].apply(get_sentiment)

# Print the updated dataframe
df_en_1.head()

Unnamed: 0,data,player,language,publishedAt,short,sentiment_pattern,sentiment_nltk
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,ten if you included the toe-poked volley to te...,0.0,0.0
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,bayerleverkusen took the lead again in the st ...,-0.0516,-0.0516
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,wissam ben yedder levelled straight away from ...,0.2263,0.2263
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,"midfielders: leandro paredes (juventus), angel...",0.0,0.0
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,midfielders: rodrigo de paul (atletico madrid)...,0.0,0.0


## Using Sentiment Intensity Analyzer from nltk

In [28]:
# Download the VADER lexicon
nltk.download('vader_lexicon')

# Create an instance of the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

# Function to get sentiment polarity
def get_sentiment(text):
    sentiment_scores = sid.polarity_scores(text)
    return sentiment_scores['compound']


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\joana\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [30]:

# Apply sentiment analysis to the "short" column and store the sentiment in a new column "sentiment_nltk"
df_en_1['sentiment_nltk'] = df_en_1['short'].apply(get_sentiment)

# Print the updated dataframe
df_en_1.head()


Unnamed: 0,data,player,language,publishedAt,short,sentiment_pattern,sentiment_nltk
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,ten if you included the toe-poked volley to te...,0.0,0.0
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,bayerleverkusen took the lead again in the st ...,0.208333,-0.0516
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,wissam ben yedder levelled straight away from ...,0.1,0.2263
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,"midfielders: leandro paredes (juventus), angel...",0.0,0.0
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,midfielders: rodrigo de paul (atletico madrid)...,0.2,0.0


## third sentiment analysis model option, specifically for news: https://pypi.org/project/NewsSentiment/


# Sentiment analysis on data condensed