In [None]:
import pandas as pd
import numpy as np

### Identifying spam in strongly related tweets

In [None]:
# Load the tweet dataset from a CSV file in the working directory.
df = pd.read_csv("spamming_detected_all_dataset.csv")
# Show the number of non-null values in each column.
df.count()

In [None]:
# For each 'strength' value, count the non-null entries of every other column.
df.groupby('strength').count()

In [None]:
# Take one 'text_lower' entry (index 1) as a sample tweet to inspect.
text = df.text_lower[1]

In [None]:
# Display the sample tweet text.
text

In [None]:
import re

In [None]:
def checkIfHaveLinks(instancia):
    """Label a piece of text by whether it contains a link.

    A link is any run of non-whitespace characters starting with ``http``
    (which also covers ``https``).

    Args:
        instancia: Text to inspect. Non-string values (for example the NaN
            pandas uses for missing text) are treated as having no link.

    Returns:
        "has_link" if a link is found, otherwise "no_link".
    """
    if not isinstance(instancia, str):
        return "no_link"
    # Search directly instead of substituting a marker and then looking for
    # it: the marker approach mislabels text that already contains the
    # literal "has_link", and a separate "https" pattern is redundant because
    # r"http\S+" already matches it.
    if re.search(r"http\S+", instancia):
        return "has_link"
    return "no_link"

In [None]:
# Label every tweet with the link detector, using the lower-cased text.
# Applying over the single column avoids building a row object per tweet,
# which is what DataFrame.apply(axis=1) does.
df['has_link'] = df['text_lower'].apply(checkIfHaveLinks)

In [None]:
# For each 'has_link' label, count the non-null entries of every other column.
df.groupby('has_link').count()

In [None]:
# Show only the rows labelled as containing a link.
df[df['has_link'] == 'has_link']

<blockquote>TODO: usar o CountVectorizer para contar os termos em geral e excluir, dentre os tweets fortemente relacionados, aqueles que contêm os termos não relacionados (#NFT #Blockchain).</blockquote>



In [None]:
# Load the per-user post-count table.
# NOTE(review): the path begins with a backslash, so it is presumably a
# Windows path resolved from the root of the current drive; it will not work
# unchanged on another machine or OS — consider a path relative to the notebook.
df_users = pd.read_csv(r'\twitter-EDA\tweets_preprocessing\identify_number_of_posts_per_user.csv')

In [None]:
# Preview the first rows of the per-user table.
df_users.head()

In [None]:
# For each 'numberOfPosts' value, count the non-null usernames.
df_users.groupby('numberOfPosts')[['username']].count()

In [None]:
# Keep only the join key and the post-count column from the per-user table.
users = df_users.loc[:, ['tweet_id', 'numberOfPosts']]

In [None]:
# Inner-join the tweets with the per-user post counts on 'tweet_id'.
merge = df.merge(users, how='inner', left_on='tweet_id', right_on='tweet_id')

In [None]:
# Rename every column of the merged frame by position.
# NOTE(review): this assigns 15 names positionally, so it only works when the
# merged frame has exactly 15 columns in this order (a length mismatch raises).
# Presumably the last column is 'numberOfPosts' being renamed to
# 'user_presence' — confirm. 'has_spam' is only computed on df in a later
# cell of this notebook, so this may depend on out-of-order execution.
merge.columns = ['datetime', 'tweet_id', 'processed_text', 'text_lower',
       'textblob_sentiment', 'vader_sentiment', 'afinn_sentiment',
       'textblob_score', 'vader_score', 'afinn_score', 'afinn_score_norm',
       'strength', 'has_link', 'has_spam', 'user_presence']

In [None]:
# Write the merged frame to CSV in the working directory, without the index.
merge.to_csv('all_dataset_processed.csv', index=False)

In [None]:
# List the column names of the merged frame.
merge.columns

### Identificando termos comuns utilizados

In [None]:
# Count how often each term occurs across the processed tweet texts.
from sklearn.feature_extraction.text import CountVectorizer

# Unigram counts: one column per vocabulary term, one row per entry of
# processed_text.
cv = CountVectorizer(ngram_range=(1, 1))
count_matrix = cv.fit_transform(df.processed_text)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back to the old name so the cell
# also runs on older installations.
feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
word_count = pd.DataFrame(list(feature_names()), columns=['term'])

# Sum the count matrix over its rows to get one total per term.
word_count["count"] = count_matrix.sum(axis=0).tolist()[0]
word_count = word_count.sort_values("count", ascending=False).reset_index(drop=True)

# Terms ordered from most to least frequent.
word_count

In [None]:
# Look up the row whose term is exactly 'token'.
word_count[word_count['term'] == 'token']

#### Salvando imagens

In [None]:
import dataframe_image as dfi

##### strength

In [None]:
# Select the text and its classification columns, then export the first
# 20 rows as a PNG table.
strength = merge.loc[:, ['text_lower', 'strength', 'has_link', 'has_spam', 'user_presence']]
dfi.export(strength.iloc[:20], "table_strength.png", table_conversion="matplotlib")

##### spam

In [None]:
# Select the text and the three spam-related columns, then export the first
# 20 rows as a PNG table.
spam = merge.loc[:, ['text_lower', 'has_link', 'has_spam', 'user_presence']]
dfi.export(spam.iloc[:20], "table_spam.png", table_conversion="matplotlib")

##### spam isolated

In [None]:
# Select the text and 'user_presence' only, then export the first 20 rows as
# a PNG table.
spam = merge.loc[:, ['text_lower', 'user_presence']]
dfi.export(spam.iloc[:20], "table_spam_user.png", table_conversion="matplotlib")

In [None]:
# Export the first 20 rows of only_norm as a PNG table.
# NOTE(review): only_norm is not defined in the visible part of this notebook,
# so this cell presumably fails with NameError on a fresh kernel — define it
# (or remove the cell) before relying on Restart & Run All.
dfi.export(
    only_norm.head(20),
    "table_afinn_norm.png",
    table_conversion="matplotlib"
)

### Algoritmo pra detectar spamming terms

In [None]:
spammingTerms = [
    'thil',
    'nft',
    'crypt',
    'blockchain',
    'asset',
    'token',
]


def checkIfHaveSpammingTerms(text):
    """Label text by whether it contains any of the spamming terms.

    Matching is by plain substring, so a term also matches inside a longer
    word.

    Args:
        text: String to scan.

    Returns:
        "has_spam" if at least one entry of ``spammingTerms`` occurs in
        ``text``, otherwise "no_spam".
    """
    found = any(keyword in text for keyword in spammingTerms)
    return "has_spam" if found else "no_spam"

# Label every tweet with the spamming-term detector, using the lower-cased
# text. Applying over the single column avoids building a row object per
# tweet, which is what DataFrame.apply(axis=1) does.
df['has_spam'] = df['text_lower'].apply(checkIfHaveSpammingTerms)

#### Algoritmo para detectar spam por links

In [None]:
import re

def checkIfHaveLinks(text):
    """Label a piece of text by whether it contains a link.

    A link is any run of non-whitespace characters starting with ``http``
    (which also covers ``https``).

    Args:
        text: Text to inspect. Non-string values (for example the NaN pandas
            uses for missing text) are treated as having no link.

    Returns:
        "has_link" if a link is found, otherwise "no_link".
    """
    if not isinstance(text, str):
        return "no_link"
    # Search directly instead of substituting a marker and then looking for
    # it: the marker approach mislabels text that already contains the
    # literal "has_link", and a separate "https" pattern is redundant because
    # r"http\S+" already matches it.
    if re.search(r"http\S+", text):
        return "has_link"
    return "no_link"

# Label every tweet with the link detector, using the lower-cased text.
# Applying over the single column avoids building a row object per tweet,
# which is what DataFrame.apply(axis=1) does.
df['has_link'] = df['text_lower'].apply(checkIfHaveLinks)

#### Algoritmo para detectar força da relação

In [None]:
relatedTopics = [
    "metaverse is", "what metaverse", "is metaverse",
    "metaverses are", "what metaverses", "are metaverses",
]


def checkIfRelated(text):
    """Rate how strongly a text relates to the topic.

    Args:
        text: String to scan.

    Returns:
        'strong' if any phrase in ``relatedTopics`` occurs in ``text`` as a
        substring, otherwise 'weak'.
    """
    is_strong = any(phrase in text for phrase in relatedTopics)
    return 'strong' if is_strong else 'weak'

# Label every tweet with its relation strength, using the lower-cased text.
# Applying over the single column avoids building a row object per tweet,
# which is what DataFrame.apply(axis=1) does.
df['strength'] = df['text_lower'].apply(checkIfRelated)

In [None]:
# Show every term that contains 'token' (str.contains treats the pattern as
# a regular expression by default).
word_count[word_count['term'].str.contains('token')]

##### Identificando usuários spammers

In [None]:
# Count non-null tweet ids for each (strength, has_spam, user_presence) combination.
merge.groupby(['strength','has_spam','user_presence'])[['tweet_id']].count()

##### Balanceando a base por categoria

In [None]:
# Render floats with two decimals and a thousands separator in displayed output.
pd.options.display.precision = 2
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
import numpy as np
from sklearn.utils import resample

In [None]:
# Tweets with no link, no spamming term and a 'user_presence' of '1 ou menos'.
merge.loc[
    merge['has_link'].eq('no_link')
    & merge['has_spam'].eq('no_spam')
    & merge['user_presence'].eq('1 ou menos')
]

In [None]:
# Frequency of each distinct 'user_presence' value.
merge['user_presence'].value_counts()

In [None]:
# Split the merged frame by relation strength: 'strong' rows go to
# dataset_minor and 'weak' rows to dataset_major.
dataset_minor = merge.loc[merge['strength'].eq('strong')]
dataset_major = merge.loc[merge['strength'].eq('weak')]

Fracos e Não spam

In [None]:
# 'weak' tweets with neither a link nor a spamming term: non-null tweet ids
# per TextBlob sentiment.
dataset_major.loc[
    dataset_major['has_link'].eq('no_link') & dataset_major['has_spam'].eq('no_spam')
].groupby(['textblob_sentiment'])[['tweet_id']].count()

Fortes e Não spam

In [None]:
# 'strong' tweets with neither a link nor a spamming term: non-null tweet ids
# per TextBlob sentiment.
dataset_minor.loc[
    dataset_minor['has_link'].eq('no_link') & dataset_minor['has_spam'].eq('no_spam')
].groupby(['textblob_sentiment'])[['tweet_id']].count()

Fracos e Spam

In [None]:
# 'weak' tweets with both a link and a spamming term: non-null tweet ids per
# TextBlob sentiment.
dataset_major.loc[
    dataset_major['has_link'].eq('has_link') & dataset_major['has_spam'].eq('has_spam')
].groupby(['textblob_sentiment'])[['tweet_id']].count()

Fortes e Spam

In [None]:
# 'strong' tweets with both a link and a spamming term: non-null tweet ids
# per TextBlob sentiment.
dataset_minor.loc[
    dataset_minor['has_link'].eq('has_link') & dataset_minor['has_spam'].eq('has_spam')
].groupby(['textblob_sentiment'])[['tweet_id']].count()

In [None]:
# Undersample the 'weak' rows down to the number of 'strong' rows.
# sklearn.utils.resample requires every array passed in one call to have the
# same number of rows, so the two differently sized subsets cannot be
# resampled together. Only the 'weak' rows are resampled, and without
# replacement so that no tweet is duplicated.
strong_rows = merge[merge['strength'] == 'strong']
weak_rows = merge[merge['strength'] == 'weak']
X_undersampled = resample(
    weak_rows,
    replace=False,
    # Sampling without replacement cannot draw more rows than exist.
    n_samples=min(len(strong_rows), len(weak_rows)),
    random_state=123,
)
# The 'strong' rows are kept in full.
y_undersampled = strong_rows