In [None]:
import pandas as pd
import numpy as np

# Calculando o sentimento mensal

In [None]:
# Load the feature table that the rest of the notebook reads through `df`.
features_csv = 'all_features.csv'
df = pd.read_csv(features_csv)

In [None]:
# Display settings: two-digit precision and a float formatter that renders
# values with thousands separators and two decimals.
for option_name, option_value in (
    ('display.precision', 2),
    ('display.float_format', '{:,.2f}'.format),
):
    pd.set_option(option_name, option_value)

In [None]:
# Show a single row as a quick look at the available columns.
df.head(n=1)

# Seleção de Atributos

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from matplotlib import pyplot
import numpy as np

In [None]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the feature matrices.

    The encoder is fitted on ``X_train`` only; that same fitted encoder is
    then applied to both splits.

    Args:
        X_train: training features, used to fit the encoder.
        X_test: held-out features, transformed with the fitted encoder.

    Returns:
        Tuple ``(X_train_enc, X_test_enc)`` holding the encoded splits.
    """
    # NOTE(review): with the encoder's default settings, a category that only
    # appears in X_test presumably raises at transform time -- confirm.
    encoder = OrdinalEncoder().fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)

def prepare_targets(y_train, y_test):
    """Label-encode the target vectors.

    The label encoder learns its classes from ``y_train`` only and is then
    applied to ``y_test``.

    Args:
        y_train: training labels, used to fit the encoder.
        y_test: held-out labels, transformed with the fitted encoder.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)`` holding the encoded labels.
    """
    label_encoder = LabelEncoder()
    train_codes = label_encoder.fit_transform(y_train)
    test_codes = label_encoder.transform(y_test)
    return train_codes, test_codes

def select_features(X_train, y_train, X_test, discrete_features=True, random_state=42):
    """Score every feature with mutual information and return the fitted selector.

    ``SelectKBest`` is configured with ``k='all'``, so no column is dropped:
    the transformed matrices keep every feature and the useful output is the
    per-feature score stored on the returned selector (``fs.scores_``).

    The scorer is now configured explicitly instead of relying on
    ``mutual_info_classif`` defaults. The features passed in by this notebook
    are ordinal-encoded categories, so they are declared discrete, and a fixed
    seed keeps the scores reproducible between runs.

    Args:
        X_train: encoded training features, used to fit the selector.
        y_train: encoded training labels.
        X_test: encoded held-out features, transformed with the fitted selector.
        discrete_features: forwarded to ``mutual_info_classif``. Pass ``'auto'``
            to get the scorer's default handling back.
        random_state: forwarded to ``mutual_info_classif``. Pass ``None`` for
            an unseeded run.

    Returns:
        Tuple ``(X_train_fs, X_test_fs, fs)``: the transformed splits and the
        fitted ``SelectKBest`` instance.
    """
    # Local import keeps this cell self-contained.
    from functools import partial

    score_func = partial(
        mutual_info_classif,
        discrete_features=discrete_features,
        random_state=random_state,
    )
    fs = SelectKBest(score_func=score_func, k='all')
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

In [None]:
# Candidate predictors, listed once so `X` and `names` cannot drift apart.
# NOTE(review): 'presence_min' is assigned further down in this notebook (from
# `user_presence`); on a fresh kernel this cell presumably relies on the CSV
# already containing that column -- confirm.
feature_columns = ['has_link', 'has_spam', 'user_presence', 'presence_min']
names = np.array(feature_columns)
X = np.array(df[feature_columns])
y = np.array(df['textblob_sentiment'])

# Hold out 10% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Encode features and target; both encoders are fitted on the training split.
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

In [None]:
# Fit the selector on the encoded training data and keep it as `fs` so its
# per-feature scores can be inspected afterwards.
X_train_fs, X_test_fs, fs = select_features(
    X_train=X_train_enc,
    y_train=y_train_enc,
    X_test=X_test_enc,
)

In [None]:
# Print one line per feature: position, column name and selector score.
for i, score in enumerate(fs.scores_):
    print('Feature %d %s: %f' % (i, names[i], score))

# Draw the same scores as a bar chart, one bar per feature position.
bar_positions = list(range(len(fs.scores_)))
pyplot.bar(bar_positions, fs.scores_)
pyplot.show()

In [None]:
import dataframe_image as dfi

In [None]:
# Preview the first 20 rows of the `text_lower` and `presence_min` columns.
df.loc[:, ['text_lower', 'presence_min']].head(20)

In [None]:
# Keep the 20-row preview in `presence`; a later cell exports it as an image.
preview_columns = ['text_lower', 'presence_min']
presence = df[preview_columns].head(20)

In [None]:
# Number of rows for each distinct `presence_min` value.
presence_min_counts = df['presence_min'].value_counts()
presence_min_counts

In [None]:
# Write the `presence` preview table to a PNG via the matplotlib converter.
dfi.export(presence, "user_presence.png", table_conversion="matplotlib")

In [None]:
# Row count per `presence_min` value, computed with the frame-level method.
df.value_counts(subset='presence_min')

In [None]:
def mergePresence(value):
    """Collapse a fine-grained presence label into a coarser bucket.

    Args:
        value: a presence label; the recognised labels are the Portuguese
            range strings listed in the table below.

    Returns:
        The bucket string for ``value``, or ``None`` when ``value`` is not
        one of the recognised labels.
    """
    # (bucket, labels that fold into it), checked in order.
    buckets = (
        ('1-10', ('1 ou menos', 'mais de 1 Até 5', 'entre 5 e 10')),
        ('10-100', ('entre 10 e 50', 'entre 50 e 100')),
        ('equal or over 100', ('entre 100 e 1000', 'entre 1000 e 10000')),
        ('over 10000', ('mais de 10000',)),
    )
    for bucket, labels in buckets:
        if value in labels:
            return bucket
    return None

In [None]:
# Inspect one spam-flagged row: the `text_lower` entry indexed by 3667774.
is_spam = df['has_spam'] == 'has_spam'
df[is_spam]['text_lower'][3667774]

In [None]:
# Derive the coarse presence bucket from `user_presence`.
# Only that one column is needed, so map it directly: `DataFrame.apply(axis=1)`
# builds a row object for every record, which is slow on a large frame, and it
# does not return a Series when the frame is empty.
df['presence_min'] = df['user_presence'].map(mergePresence)

# Geral

#### Quantidade de tweets por mês

In [None]:
# Parse the timestamp column so it can serve as a datetime index for grouping.
df['datetime'] = pd.to_datetime(df['datetime'])

# Monthly and weekly groupers for the aggregation cells.
# NOTE(review): newer pandas releases presumably prefer the 'ME' alias over
# 'M' for month-end -- confirm against the pandas version in use.
month_key, week_key = pd.Grouper(freq='M'), pd.Grouper(freq='W')

In [None]:
# Count of non-null `tweet_id` values in each monthly bucket.
(
    df.set_index('datetime')
    .groupby([month_key])[['tweet_id']]
    .count()
)

In [None]:
# Sum of `afinn_score_norm` in each monthly bucket, over all rows.
(
    df.set_index('datetime')
    .groupby([month_key])[['afinn_score_norm']]
    .sum()
)

# Positivos

In [None]:
# Rows whose `afinn_sentiment` is 'Positive'.
is_afinn_positive = df['afinn_sentiment'] == 'Positive'
positive = df[is_afinn_positive]

# Monthly sum of `afinn_score_norm` for those rows.
(
    positive.set_index('datetime')
    .groupby([month_key])[['afinn_score_norm']]
    .sum()
)

In [None]:
# Count of non-null `tweet_id` values per month among the `positive` rows.
(
    positive.set_index('datetime')
    .groupby([month_key])[['tweet_id']]
    .count()
)

In [None]:
# Values pasted by hand into this cell and kept for reference.
# NOTE(review): presumably the monthly positive-tweet counts produced by the
# cell above -- confirm. They are stored as one list so the whole series is
# displayed; as separate bare expressions only the last value would be echoed.
positive_monthly_counts_note = [
    136098,
    323106,
    257998,
    257306,
    200357,
    197510,
    221623,
]
positive_monthly_counts_note

# Positivos & Fortes

In [None]:
# Rows whose `strength` is 'strong'. The sentiment filter that makes this
# frame "positive" is applied in the following cell.
is_strong = df['strength'] == 'strong'
positive_strong = df[is_strong]

In [None]:
# Narrow the strong rows to those with `textblob_sentiment` == 'Positive'.
is_textblob_positive = positive_strong['textblob_sentiment'] == 'Positive'
positive_strong = positive_strong[is_textblob_positive]

# Monthly sum of `textblob_score` for the remaining rows.
(
    positive_strong.set_index('datetime')
    .groupby([month_key])[['textblob_score']]
    .sum()
)

In [None]:
# Count of non-null `tweet_id` values per month among `positive_strong` rows.
(
    positive_strong.set_index('datetime')
    .groupby([month_key])[['tweet_id']]
    .count()
)

# Positivos & Fortes & Realmente Não-Spam

In [None]:
month_key, week_key = pd.Grouper(freq='M'), pd.Grouper(freq='W')

# Strong rows with no link and no spam flag whose `user_presence` is
# '1 ou menos'.
keep_row = (
    (df['strength'] == 'strong')
    & (df['has_link'] == 'no_link')
    & (df['has_spam'] == 'no_spam')
    & (df['user_presence'] == '1 ou menos')
)
positive_strong_nospam = df[keep_row]

# Of those, keep only rows with `textblob_sentiment` == 'Positive'.
positive_strong_nospam = positive_strong_nospam[
    positive_strong_nospam['textblob_sentiment'] == 'Positive'
]

# Monthly sum of `textblob_score` for the filtered rows.
(
    positive_strong_nospam.set_index('datetime')
    .groupby([month_key])[['textblob_score']]
    .sum()
)

In [None]:
# Count of non-null `tweet_id` values per month among the
# `positive_strong_nospam` rows.
(
    positive_strong_nospam.set_index('datetime')
    .groupby([month_key])[['tweet_id']]
    .count()
)

In [None]:
# Values pasted by hand into this cell and kept for reference.
# NOTE(review): presumably monthly score sums for the positive/strong/no-spam
# subset, copied from an output that used a decimal comma -- confirm.
# Written with decimal points: as pasted, Python parsed each line as a
# two-element tuple (e.g. `313,72` -> `(313, 72)`) rather than one number.
positive_strong_nospam_monthly_note = [
    313.72,
    501.60,
    469.26,
    562.64,
    381.68,
    240.19,
    213.82,
]
positive_strong_nospam_monthly_note

# Negativos & Fortes & Não Spam

In [None]:
month_key, week_key = pd.Grouper(freq='M'), pd.Grouper(freq='W')

# Strong rows with no link and no spam flag.
# NOTE(review): despite the variable name, this frame keeps rows where
# `has_spam` == 'no_spam'.
keep_row = (
    (df['strength'] == 'strong')
    & (df['has_link'] == 'no_link')
    & (df['has_spam'] == 'no_spam')
)
negative_strong_spam = df[keep_row]

# Of those, keep only rows with `textblob_sentiment` == 'Negative'.
negative_strong_spam = negative_strong_spam[
    negative_strong_spam['textblob_sentiment'] == 'Negative'
]

# Monthly sum of `textblob_score` for the filtered rows.
(
    negative_strong_spam.set_index('datetime')
    .groupby([month_key])[['textblob_score']]
    .sum()
)

In [None]:
# Count of non-null `tweet_id` values per month among the
# `negative_strong_spam` rows.
(
    negative_strong_spam.set_index('datetime')
    .groupby([month_key])[['tweet_id']]
    .count()
)

# Negativos & Fracos & Spam

In [None]:
month_key, week_key = pd.Grouper(freq='M'), pd.Grouper(freq='W')

# Weak rows that carry both a link and a spam flag.
keep_row = (
    (df['strength'] == 'weak')
    & (df['has_link'] == 'has_link')
    & (df['has_spam'] == 'has_spam')
)
negative_weak_spam = df[keep_row]

# Of those, keep only rows with `afinn_sentiment` == 'Negative'.
negative_weak_spam = negative_weak_spam[
    negative_weak_spam['afinn_sentiment'] == 'Negative'
]

# Monthly sum of `afinn_score_norm` for the filtered rows.
(
    negative_weak_spam.set_index('datetime')
    .groupby([month_key])[['afinn_score_norm']]
    .sum()
)

In [None]:
# Count of non-null `tweet_id` values per month among the
# `negative_weak_spam` rows.
(
    negative_weak_spam.set_index('datetime')
    .groupby([month_key])[['tweet_id']]
    .count()
)

In [None]:
# Values pasted by hand into this cell and kept for reference.
# NOTE(review): presumably monthly score sums for a negative subset, copied
# from an output that used a decimal comma -- confirm which cell they belong to.
# Written with decimal points: as pasted, Python parsed each line as a
# two-element tuple (e.g. `-1937,85` -> `(-1937, 85)`) rather than one number.
negative_weak_spam_monthly_note = [
    -1937.85,
    -4539.52,
    -5281.71,
    -6592.50,
    -6217.81,
    -6202.19,
    -6938.73,
]
negative_weak_spam_monthly_note