In [51]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd

# Load model

In [78]:
# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Load data clean

In [65]:
# Load the CSV file into a DataFrame
df_de = pd.read_csv('../Preprocessing/data_clean/de_clean_1.csv')
df_en = pd.read_csv('../Preprocessing/data_clean/en_clean_1.csv')
df_es = pd.read_csv('../Preprocessing/data_clean/es_clean_1.csv')

### DE fix

In [66]:
#for DE datasate drop empty lines
df_de = df_de.dropna(subset=['data'])


### Hincapié fix

In [67]:
# Replace 'Piero Hincapié' with 'Piero Hincapie' in the 'player' column
df_es['player'] = df_es['player'].replace('Piero Hincapié', 'Piero Hincapie')
df_en['player'] = df_en['player'].replace('piero hincapié', 'piero hincapie')


# Sentiment Analysis

In [70]:
def sentiment_score(data):
    if isinstance(data, str):
        tokens = tokenizer.encode(data, return_tensors='pt')
        result = model(tokens)
        return int(torch.argmax(result.logits)) + 1
    else:
        return 0

In [71]:
df_de['sentiment'] = df_de['data'].apply(lambda x: sentiment_score(x[:512]))
df_en['sentiment'] = df_en['data'].apply(lambda x: sentiment_score(x[:512]))
df_es['sentiment'] = df_es['data'].apply(lambda x: sentiment_score(x[:512]))

In [72]:
# Grouping the data by 'player' and calculating the mean of 'sentiment' for each player
mean_sentiment_de = df_de.groupby('player')['sentiment'].mean()
mean_sentiment_en = df_en.groupby('player')['sentiment'].mean()
mean_sentiment_es = df_es.groupby('player')['sentiment'].mean()

print('Sentiment score per player DE')
print(mean_sentiment_de)
print('Sentiment score per player EN')
print(mean_sentiment_en)
print('Sentiment score per player ES')
print(mean_sentiment_es)

Sentiment score per player DE
player
exequiel palacios    1.608247
jeremie frimpong     1.811966
jonathan tah         1.784091
mitchel bakker       1.677083
moussa diaby         1.760331
mykhaylo mudryk      1.000000
piero hincapie       1.623529
Name: sentiment, dtype: float64
Sentiment score per player EN
player
exequiel palacios    2.958333
jeremie frimpong     2.666667
jonathan tah         3.142857
mitchel bakker       2.777778
moussa diaby         2.698925
mykhaylo mudryk      2.692308
piero hincapie       2.260870
Name: sentiment, dtype: float64
Sentiment score per player ES
player
Exequiel Palacios    2.927835
Jeremie Frimpong     2.956522
Jonathan Tah         2.833333
Mitchel Bakker       2.818182
Moussa Diaby         2.548387
Mykhaylo Mudryk      1.962963
Piero Hincapie       3.217949
Name: sentiment, dtype: float64


# Player sentence analysis

In [73]:
def extract_sentence(df):
    df['short'] = ''
    
    for index, row in df.iterrows():
        player = row['player']
        data = str(row['data'])  # Convert data to string
        
        # Find the sentence containing player's information
        sentences = data.split('.')
        for sentence in sentences:
            if player.lower() in sentence.lower():
                df.at[index, 'short'] = sentence.strip()
                break
    
    return df

In [74]:
extract_sentence(df_de)
extract_sentence(df_en)
extract_sentence(df_es)

Unnamed: 0,data,player,language,publishedAt,sentiment,short
0,"Mercado de fichajes de fútbol, en directo Con...",Exequiel Palacios,es,2023-01-29T18:25:03Z,2,"Mercado de fichajes de fútbol, en directo Con..."
1,La última semana de marzo es la primera ventan...,Exequiel Palacios,es,2023-01-30T16:52:46Z,2,"Además de Messi y Ángel Di María, máximos refe..."
2,Gustavo Puerta ya no jugará en el Bayer Leverk...,Exequiel Palacios,es,2023-01-31T20:41:38Z,2,"El club alemán, que siempre se ha caracterizad..."
3,"Alberto Fernández El presidente de la AFA, Cl...",Exequiel Palacios,es,2023-02-09T18:32:38Z,3,"Alberto Fernández El presidente de la AFA, Cl..."
4,Alberto Fernández También participaron los ot...,Exequiel Palacios,es,2023-02-12T21:13:55Z,4,Alberto Fernández También participaron los ot...
...,...,...,...,...,...,...
268,"Antes del minuto, el visitante ya había llegad...",Piero Hincapie,es,2023-05-11T21:13:48Z,1,
269,El Bayer Leverkusen jugó de visita contra la R...,Piero Hincapie,es,2023-05-11T20:56:21Z,3,
270,Un ecuatoriano volverá a semifinales de un tor...,Piero Hincapie,es,2023-05-11T18:30:25Z,4,
271,La Selección de Ecuador jugará sus dos últimos...,Piero Hincapie,es,2023-05-10T23:37:55Z,3,


In [75]:
df_de['sentiment2'] = df_de['short'].apply(lambda x: sentiment_score(x[:512]))
df_en['sentiment2'] = df_en['short'].apply(lambda x: sentiment_score(x[:512]))
df_es['sentiment2'] = df_es['short'].apply(lambda x: sentiment_score(x[:512]))

In [76]:
# Grouping the data by 'player' and calculating the mean of 'sentiment' for each player
mean_sentiment_de = df_de.groupby('player')['sentiment2'].mean()
mean_sentiment_en = df_en.groupby('player')['sentiment2'].mean()
mean_sentiment_es = df_es.groupby('player')['sentiment2'].mean()

print('Sentiment score per player DE')
print(mean_sentiment_de)
print('Sentiment score per player EN')
print(mean_sentiment_en)
print('Sentiment score per player ES')
print(mean_sentiment_es)

Sentiment score per player DE
player
exequiel palacios    1.896907
jeremie frimpong     2.350427
jonathan tah         2.363636
mitchel bakker       2.302083
moussa diaby         2.272727
mykhaylo mudryk      1.166667
piero hincapie       2.200000
Name: sentiment2, dtype: float64
Sentiment score per player EN
player
exequiel palacios    2.250000
jeremie frimpong     2.839506
jonathan tah         3.571429
mitchel bakker       2.333333
moussa diaby         3.075269
mykhaylo mudryk      2.865385
piero hincapie       3.000000
Name: sentiment2, dtype: float64
Sentiment score per player ES
player
Exequiel Palacios    3.226804
Jeremie Frimpong     3.173913
Jonathan Tah         3.000000
Mitchel Bakker       2.272727
Moussa Diaby         2.580645
Mykhaylo Mudryk      2.407407
Piero Hincapie       3.858974
Name: sentiment2, dtype: float64
