In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
tqdm.pandas()

In [None]:
df = pd.read_csv(r"sentiment_scores_dates.csv")

In [None]:
# Parse the timestamp column once, vectorised, instead of re-parsing every
# row twice through a Python-level lambda.
_parsed_datetime = pd.to_datetime(df['datetime'])
df['date'] = _parsed_datetime.dt.strftime('%d/%m/%Y')     # day/month/year
df['date_month'] = _parsed_datetime.dt.strftime('%d/%m')  # day/month only

In [None]:
df.to_csv('sentiment_scores_dates.csv', index=False)

### Analyzing distribution of scores

In [None]:
# Plotting stack plus locale-aware number formatting on the axes.
import matplotlib.pyplot as plt
import seaborn as sns
import locale

# Locale names are platform specific, so try the common spellings of pt_BR in
# turn instead of crashing when the first one is not installed.
for _locale_name in ("pt_BR.utf8", "pt_BR.UTF-8", "Portuguese_Brazil.1252"):
    try:
        locale.setlocale(locale.LC_ALL, _locale_name)
        break
    except locale.Error:
        continue
else:
    # No candidate was accepted: keep whatever locale is already active.
    print("pt_BR locale not available; keeping the current locale for axis formatting")

# Let matplotlib format tick labels using the active locale
plt.rcParams.update({
    'axes.formatter.use_locale' : True,
})
plt.style.use('ggplot')

In [None]:
# https://www.fschuch.com/blog/2020/10/14/graficos-com-qualidade-de-publicacao-em-python-com-matplotlib/
# define a pattern of dimensions for plots
def get_figsize(columnwidth=4, wf=1.0, hf_rel=(5.0 ** 0.5 - 1.0) / 2.0, hf_abs=None, unit="inch"):
    """Return a ``(width, height)`` figure size in inches.

    Parameters
    ----------
    columnwidth : number
        Available width, expressed in ``unit``.
    wf : number
        Fraction of ``columnwidth`` the figure should occupy.
    hf_rel : number
        Height as a fraction of the final width; only used when ``hf_abs``
        is None.
    hf_abs : number or None
        Absolute height expressed in ``unit``; takes precedence over ``hf_rel``.
    unit : str
        One of "inch", "mm", "cm" or "pt" (case-insensitive).

    Raises
    ------
    ValueError
        If ``unit`` is not one of the supported units.
    """
    # Number of each unit per inch (the result is expressed in inches)
    conversion = dict(inch=1.0, mm=25.4, cm=2.54, pt=72.0)

    per_inch = conversion.get(unit.lower())
    if per_inch is None:
        raise ValueError(f"unit deve ser: {conversion.keys()}")

    # Convert to inches first, then keep only the requested fraction of it
    fig_width = (columnwidth / per_inch) * wf

    # An explicit height wins; otherwise derive it from the final width
    fig_height = fig_width * hf_rel if hf_abs is None else hf_abs / per_inch

    return (fig_width, fig_height)

# Notebook-wide defaults: figure size derived from a 455 pt wide column, and
# the same size (12) for every text element.
_font_size_keys = (
    "axes.labelsize",
    "font.size",
    "legend.fontsize",
    "xtick.labelsize",
    "ytick.labelsize",
)
_plot_defaults = {'figure.figsize': get_figsize(columnwidth=455.0, unit='pt')}
_plot_defaults.update({_key: 12 for _key in _font_size_keys})
plt.rcParams.update(_plot_defaults)

# if necessary a customization for each particular figure
# fig, axes = plt.subplots(figsize=get_figsize(columnwidth=16, unit='cm', hf_rel=0.5))

### Exemplo: salvando a figura em formato vetorial

In [None]:
# plt.savefig('example_line.'+f, format=f)

### Distribuição dos scores (Sem normalização)

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per scorer, on the raw scores.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False).
for _column, _label in [("textblob_score", 'Textblob score'),
                        ("vader_score", 'Vader score'),
                        ("afinn_score", 'Afinn score')]:
    sns.kdeplot(df[_column], ax=ax, label=_label)

# Label the axes explicitly, like the later figures in this notebook
ax.set_xlabel('Polaridade')
ax.set_ylabel('Densidade')
ax.legend()
fig.savefig('1_scores_distribution.svg')
fig.savefig('1_scores_distribution.png')

### Normalização do score

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

In [None]:
array = np.array(df['afinn_score']).reshape(-1, 1)

In [None]:
# squares_root
def squareRoot(value):
    """Signed square root: ``sqrt(|value|)`` carrying the sign of ``value``.

    Values that are neither greater nor less than zero (zero itself, NaN)
    yield the integer 0.
    """
    if value > 0:
        return np.sqrt(value)
    # Negative input: take the root of the magnitude and flip the sign back
    return np.sqrt(abs(value)) * (-1) if value < 0 else 0

# Signed square root of every Afinn score. Mapping over the single column
# avoids building a full row object per tweet just to read one value.
squares = df['afinn_score'].map(squareRoot)

##### Normalização

In [None]:
df_afinn_score = df['afinn_score']

In [None]:
normalizado = MinMaxScaler(feature_range=(-1,1)).fit_transform(np.array(df_afinn_score).reshape(-1,1))

##### Gráfico com normalização 1

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per series.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False);
# np.ravel flattens the (n, 1) scaler output into a 1-D vector.
for _values, _label in [(df["textblob_score"], 'Textblob score'),
                        (df["vader_score"], 'Vader score'),
                        (np.ravel(normalizado), 'Afinn score normalizado')]:
    sns.kdeplot(_values, ax=ax, label=_label)

ax.legend()
ax.set_xlabel('Polaridade')
# Fix: this was a second xlabel call, which overwrote 'Polaridade' on the
# x axis and never set the y label.
ax.set_ylabel('Densidade')
fig.savefig('2_scores_distribution.svg')
fig.savefig('2_scores_distribution.png')

##### Gráfico com raiz quadrada e normalização

In [None]:
normalizadoComRaiz1 = MinMaxScaler(feature_range=(-1,1)).fit_transform(squares.values.reshape(-1,1))

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per series.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False);
# np.ravel flattens the (n, 1) scaler output into a 1-D vector.
for _values, _label in [(df["textblob_score"], 'Textblob'),
                        (df["vader_score"], 'Vader'),
                        (np.ravel(normalizadoComRaiz1), 'Afinn (raíz + norm)')]:
    sns.kdeplot(_values, ax=ax, label=_label)

ax.legend()
ax.set_xlabel('Polaridade')
# Fix: this was a second xlabel call, which overwrote 'Polaridade' on the
# x axis and never set the y label.
ax.set_ylabel('Densidade')
fig.savefig('3_scores_distribution_raíz_e_normalização.svg')
fig.savefig('3_scores_distribution_raíz_e_normalização.png')

##### Raiz quadrada e normalização 2

In [None]:
def squareRoot(value):
    """Return the square root of ``|value|`` with the sign of ``value``.

    Same contract as the ``squareRoot`` declared earlier in this notebook:
    inputs that compare neither below nor above zero return the integer 0.
    """
    if value < 0:
        # Root of the magnitude, sign restored afterwards
        return np.sqrt(abs(value)) * (-1)
    if value > 0:
        return np.sqrt(value)
    return 0

In [None]:
# Signed square root applied twice (a signed fourth root) to every Afinn
# score; mapping over the column avoids a row-wise pass over the whole frame.
squares2 = df['afinn_score'].map(lambda score: squareRoot(squareRoot(score)))

In [None]:
normalizadoComRaiz2 = MinMaxScaler(feature_range=(-1,1)).fit_transform(squares2.values.reshape(-1,1))

In [None]:
# Same computation as the two cells above, kept in a single cell:
# signed square root applied twice, then rescaled to the [-1, 1] range.
# Mapping over the column avoids a row-wise pass over the whole frame.
squares2 = df['afinn_score'].map(lambda score: squareRoot(squareRoot(score)))
normalizadoComRaiz2 = MinMaxScaler(feature_range=(-1,1)).fit_transform(squares2.values.reshape(-1,1))

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per series.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False);
# np.ravel flattens the (n, 1) scaler output into a 1-D vector.
for _values, _label in [(df["textblob_score"], 'Textblob'),
                        (df["vader_score"], 'Vader'),
                        (np.ravel(normalizadoComRaiz2), 'Afinn norm (2)')]:
    sns.kdeplot(_values, ax=ax, label=_label)

ax.legend()
ax.set_xlabel('Polaridade')
ax.set_ylabel('Densidade')
fig.savefig('4_normalização_minmax_scale.svg')
fig.savefig('4_normalização_minmax_scale.png')

In [None]:
df['afinn_score_norm'] = normalizadoComRaiz2

### Z-score normalization

In [None]:
def zscoreNormalization(x, mean, std):
    """Standardise ``x``: its offset from ``mean`` divided by ``std``."""
    offset = x - mean
    return offset / std
    

In [None]:
mean, std = np.mean(df['afinn_score']), np.std(df['afinn_score'])

In [None]:
# Z-score of every Afinn score. The helper is plain arithmetic, so it can take
# the whole column at once instead of being called once per row.
df['afinn_zscored'] = zscoreNormalization(df['afinn_score'], mean, std)

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per series.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False).
for _column, _label in [("textblob_score", 'Textblob'),
                        ("vader_score", 'Vader'),
                        ("afinn_zscored", 'Afinn Z-score')]:
    sns.kdeplot(df[_column], ax=ax, label=_label)

ax.legend()
ax.set_xlabel('Polaridade')
ax.set_ylabel('Densidade')
fig.savefig('5_afinn_zscore.svg')
fig.savefig('5_afinn_zscore.png')

In [None]:
afinn_zscored = np.array(df['afinn_zscored'])

In [None]:
afinn_zscored_norm = MinMaxScaler(feature_range=(-1,1)).fit_transform(afinn_zscored.reshape(-1, 1))

In [None]:
df['afinn_zscored_norm'] = afinn_zscored_norm

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per series.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False).
for _column, _label in [("textblob_score", 'Textblob'),
                        ("vader_score", 'Vader'),
                        ("afinn_zscored_norm", 'Afinn Z-scored norm')]:
    sns.kdeplot(df[_column], ax=ax, label=_label)

ax.legend()
ax.set_xlabel('Polaridade')
ax.set_ylabel('Densidade')
fig.savefig('6_afinn_zscore_norm.svg')
fig.savefig('6_afinn_zscore_norm.png')

### Textblob normalization

In [None]:
textblob_score = df['textblob_score']
vader_score = df['vader_score']

In [None]:
textblob_norm = MinMaxScaler(feature_range=(-1,1)).fit_transform(textblob_score.values.reshape(-1,1))
vader_norm = MinMaxScaler(feature_range=(-1,1)).fit_transform(vader_score.values.reshape(-1,1))

In [None]:
# Set up the figure
sns.set_style("white")
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# One KDE (Kernel Density Estimation) curve per min-max scaled series.
# sns.kdeplot replaces the deprecated sns.distplot(..., kde=True, hist=False);
# np.ravel flattens the (n, 1) scaler outputs into 1-D vectors.
# Fix: 'afinn_score_norm' holds normalizadoComRaiz2 (root + min-max), not a
# z-score, so it gets the label already used for that series instead of
# 'Afinn Z-score'.
for _values, _label in [(np.ravel(textblob_norm), 'Textblob'),
                        (np.ravel(vader_norm), 'Vader'),
                        (df['afinn_score_norm'], 'Afinn norm (2)')]:
    sns.kdeplot(_values, ax=ax, label=_label)

ax.legend()
ax.set_xlabel('Polaridade')
ax.set_ylabel('Densidade')

### Textblob

In [None]:
from textblob import TextBlob

In [None]:
# determine the sentiment
def getScore(score):
    """Map a numeric score to 'Negative', 'Neutral' or 'Positive'.

    Scores below zero are negative, exactly zero is neutral, and everything
    else falls through to positive.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Create the TextBlob columns: subjectivity, polarity and a categorical label.
#
# NOTE(review): getSubjectivity / getPolarity / getAnalysis were called here
# but are not defined anywhere in this notebook, so the cell raised NameError
# on a fresh kernel. The definitions below are assumptions - confirm them
# against the helpers that were originally used.
def getSubjectivity(text):
    """Return TextBlob's subjectivity for ``text``; 0 for non-string values."""
    return TextBlob(text).sentiment.subjectivity if isinstance(text, str) else 0


def getPolarity(text):
    """Return TextBlob's polarity for ``text``; 0 for non-string values."""
    return TextBlob(text).sentiment.polarity if isinstance(text, str) else 0


def getAnalysis(score):
    """Label a polarity score by delegating to ``getScore`` defined above."""
    return getScore(score)


df['textBlob_subjectivity'] = df.progress_apply(lambda row: getSubjectivity(row['processed_text']), axis=1)
df['textBlob_polarity'] = df.progress_apply(lambda row: getPolarity(row['processed_text']), axis=1)
df['textBlob_analysis'] = df.progress_apply(lambda row: getAnalysis(row['textBlob_polarity']), axis=1)

df.to_csv('processed_text_withemojis_tb.csv', index=False)

### Vader

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Type guard used before handing a value to a text-only scorer
def checkIfValueIsString(test_string):
    """Return True when ``test_string`` is a ``str`` instance, else False."""
    is_text = isinstance(test_string, str)
    return is_text

# Score one text with the module-level VADER analyzer
def getScores(text):
    """Return VADER's polarity scores for ``text``.

    Strings are passed to ``analyzer.polarity_scores``. Any other value gets
    an all-zero mapping with the keys 'neg', 'neu', 'pos' and 'compound'.
    Returning the integer 0 here made the later
    ``row['vader_scores']['neg']`` lookups fail with TypeError.
    """
    if isinstance(text, str):
        return analyzer.polarity_scores(text)
    return {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}

# Score every row's processed_text with VADER; 'vader_scores' stores what getScores returns
df['vader_scores'] = df.progress_apply(lambda row: getScores(row['processed_text']), axis=1)

df.to_csv('processed_text_withemojis_tb_vd.csv', index=False)

In [None]:
# Split the per-row VADER result into one column per component
for _component in ('neg', 'neu', 'pos', 'compound'):
    df[f'vader_{_component}'] = df.progress_apply(
        lambda row, key=_component: row['vader_scores'][key],
        axis=1,
    )

In [None]:
df.head()

In [None]:
# Keep only the columns used from here on. .copy() makes the result an
# independent frame, so the later df['afinn_score'] = ... assignment is not a
# write into a slice of the previous frame.
df = df[['datetime', 'tweet_id', 'text', 'processed_text',
       'textBlob_subjectivity', 'textBlob_polarity', 
       'textBlob_analysis','vader_compound']].copy()

In [None]:
df.head()

In [None]:
df.to_csv('processed_text_withemojis_tb_vd.csv', index=False)

### Afinn

In [None]:
#importing necessary libraries
from afinn import Afinn
#instantiate afinn
afn = Afinn()

# Guard so that only text reaches the lexicon scorer
def checkIfValueIsString(test_string):
    """Tell whether ``test_string`` is a ``str`` instance.

    Same contract as the helper of the same name declared earlier in this
    notebook.
    """
    result = isinstance(test_string, str)
    return result

# Score one text with the module-level Afinn instance
def getAfinnScores(text):
    """Return ``afn.score(text)`` for strings and 0 for any other value."""
    if not checkIfValueIsString(text):
        return 0
    return afn.score(text)

df['afinn_score'] = df.progress_apply(lambda row: getAfinnScores(row['processed_text']), axis=1)

In [None]:
df.head()

In [None]:
df.to_csv('processed_text_withemojis_tb_vd_af.csv', index=False)

In [None]:
df['datetime'] = pd.to_datetime(df['datetime'])

In [None]:
df.columns

In [None]:
# NOTE(review): 'text_lower' is selected below (and used by later cells) but
# is never created in this notebook, and the earlier column subset does not
# keep it, so a fresh run raised KeyError here. Presumably it is the
# lower-cased tweet text - confirm. It is only derived when missing.
if 'text_lower' not in df.columns:
    df['text_lower'] = df['text'].str.lower()

frame = df[['datetime', 'tweet_id', 'text', 'text_lower', 'processed_text', 'textBlob_polarity', 'textBlob_analysis',
       'vader_compound', 'afinn_score']]

In [None]:
frame.to_csv('processed_text_withemojis_scores.csv', index=False)

### Delimitando tweets que realmente falam sobre o metaverso

<blockquote>A força do tweet é determinada pela presença na pesquisa pelos usuários do Google Trends</blockquote>

Singular

- metaverse is (139237)
- what metaverse (2932)
- is metaverse (31952)
- what is metaverse (1794)
- what metaverse is (884)

Plural

- metaverses are (604)
- what metaverses (26)
- are metaverses (58)
- what are metaverses (0)
- what metaverses are (9)

<blockquote>Textos que fazem parte das pesquisas feitas por usuários são considerados fortes. Os tweets que contêm esses termos irão compor os tópicos fortemente relacionados</blockquote>

In [None]:
# Search phrases that mark a tweet as strongly related, joined into a single
# regex alternation (singular forms first, then plural forms).
_singular_phrases = ['metaverse is', 'what metaverse', 'is metaverse',
                     'what is metaverse', 'what metaverse is']
_plural_phrases = ['metaverses are', 'what metaverses', 'are metaverses',
                   'what metaverses are']
relatedTopics = '|'.join(_singular_phrases + _plural_phrases)

In [None]:
df.groupby('textBlob_analysis')['text'].count()

### Tweets fracamente relacionados

In [None]:
# Weakly related tweets: the text matches none of the searched phrases.
# na=False makes a missing text count as "no match", so the mask is strictly
# boolean and can be negated and used for indexing.
_is_related = df['text_lower'].str.contains(relatedTopics, regex=True, na=False)
df[~_is_related].set_index('datetime').groupby('textBlob_analysis')['text'].resample('M').count()

### Tweets fortemente relacionados

In [None]:
# Strongly related tweets: the text matches at least one searched phrase.
# na=False makes a missing text count as "no match", so the mask is strictly
# boolean and can be used for indexing.
_is_related = df['text_lower'].str.contains(relatedTopics, regex=True, na=False)
df[_is_related].set_index('datetime').groupby('textBlob_analysis')['text'].resample('M').count()

<blockquote>Como se pode ver na tabela, para comentários fortemente relacionados ao metaverso, o número de comentários positivos supera o número de comentários negativos</blockquote>

In [None]:
# TODO: fazer merge com pd.apply e verificar de qual usuário cada base de dados pertence

In [None]:
# Columns to load from the preprocessed tweets file ('username' was listed twice)
columns = ['datetime', 'tweet_id', 'username', 'like_count', 'user_followers',
           'user_fav_count', 'is_verified', 'created_at']

In [None]:
tweets_users = pd.read_csv(r'\twitter-EDA\tweets_preprocessing_norm\tweets_preprocessed_english.csv', usecols=columns)

### Merging values of two columns

In [None]:
sentiment = df[['tweet_id','textBlob_analysis']]

In [None]:
text_lower = df[['tweet_id','text_lower']]

##### Inner join

In [None]:
# NOTE(review): this cell used `merged_inner` as the left operand before it
# was ever assigned, so it raised NameError on a fresh kernel. `tweets_users`
# and `sentiment` are prepared above and not merged anywhere, so the missing
# first step is presumably their join on tweet_id - confirm.
merged_inner = pd.merge(left=tweets_users, right=sentiment, on='tweet_id')

# Attach the lower-cased text (pd.merge defaults to an inner join)
merged_inner = pd.merge(left=merged_inner, right=text_lower, on='tweet_id')

In [None]:
import pandas as pd

#### Salvando sentimentos

In [None]:
df = pd.read_csv('sentiment_scores.csv')

In [None]:
df.columns

In [None]:
only_sentiments = df[['processed_text','textblob_sentiment', 'vader_sentiment', 'afinn_sentiment','textblob_score', 'vader_score', 'afinn_score']]

In [None]:
import dataframe_image as dfi

In [None]:
# Display settings used for the exported tables: show every column and row,
# truncate long cell text at 30 characters.
# 'display.max_columns' is the exact option name; the previous
# 'display.max_column' only worked through abbreviated-name matching.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 30)
# NOTE(review): -1 is kept as in the original; confirm that None
# (auto-detect) is not what was intended here.
pd.set_option('display.width', -1)
pd.set_option('display.max_rows', None)

In [None]:
only_sentiments.columns = ['processed_text',
       'textblob',
       'vader',
       'afinn',
       't_score',
       'v_score',
       'a_score']

In [None]:
dfi.export(
    only_sentiments.head(20),
    "table_sentiment_and_score_.png",
    table_conversion="matplotlib"
)

#### Salvando normalização

In [None]:
strength = pd.read_csv(r'\twitter-EDA\tweets_studies_ranking\strength.csv')

In [None]:
strength.columns

In [None]:
only_norm = strength[[
       'processed_text',
       'textblob_sentiment',
       'vader_sentiment',
       'afinn_sentiment',
       'textblob_score',
       'vader_score',
       'afinn_score',
       'afinn_score_norm']]

In [None]:
only_norm.columns = ['processed_text',
       'textblob',
       'vader',
       'afinn',
       't_score',
       'v_score',
       'a_score',
       'a_norm_score']

In [None]:
dfi.export(
    only_norm.head(20),
    "table_afinn_norm.png",
    table_conversion="matplotlib"
)

In [None]:
df.head()