In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns

# Apply seaborn's default theme to the matplotlib figures created below.
sns.set()

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read only the columns this analysis uses; the first of them (in file order)
# becomes the index.
data = pd.read_csv(
    "../dataset/train.csv",
    usecols=["id", "text", "target"],
    index_col=0,
)
data

In [None]:
# 'target' is renamed to 'real'; 'non-real' is its complement, so that summing
# either column counts the tweets of that class.
data = data.rename(columns={'target': 'real'})
data['non-real'] = (data['real'] != 1).astype(int)

# A mention is any whitespace-separated token that starts with '@' and has at
# least one more character. Splitting on *any* whitespace (not just ' ') also
# catches mentions that follow a newline or a tab inside the tweet.
data['mentions'] = data['text'].map(
    lambda text: [token for token in text.split() if len(token) > 1 and token.startswith('@')]
)
data['mention_count'] = data['mentions'].map(len)

# The raw text is not needed once the mentions have been extracted.
data = data.drop(columns='text')
data

## Cantidad de menciones

In [None]:
# Summary statistics of the number of mentions per tweet, shown as a table.
data[['mention_count']].describe()

In [None]:
# Total number of mentions across all tweets.
data['mention_count'].sum()

In [None]:
# Tweets that contain at least one mention.
data.loc[data['mention_count'].gt(0)]

## Usuarios mencionados

In [None]:
# One row per (tweet, mentioned user) pair, then aggregate per user:
#   real  -> number of mentions that come from real tweets
#   total -> number of mentions overall
# `explode` expands each list of mentions into rows while repeating the tweet's
# 'real' flag, avoiding the per-row Series construction of
# `apply(pd.Series).stack()`.
users = (
    data.loc[data['mention_count'] > 0, ['mentions', 'real']]
    .explode('mentions')
    .rename(columns={'mentions': 'user'})
    .assign(total=1)
    .groupby('user')[['real', 'total']]
    .sum()
    .sort_values(by=['total', 'real'], ascending=False)
)
users

In [None]:
# Summary statistics of the per-user mention counts.
users.describe()

In [None]:
# Word cloud of the mentioned users, each one weighted by its total number of
# mentions.
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=5,
)
wordcloud = wordcloud.generate_from_frequencies(users['total'])

plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

# Save before showing the figure.
plt.savefig('users-wordcloud.png')
plt.show()

In [None]:
def plot_users(series, title, figname):
    """Plot per-user mention counts as overlaid bars and save the figure.

    Parameters
    ----------
    series : pandas.DataFrame
        Frame whose index holds the x-axis categories (the callers in this
        notebook pass user handles) and which has the numeric columns
        ``'total'`` and ``'real'``.
    title : str
        Figure title.
    figname : str
        Path the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(25, 10))
    fig.suptitle(title, fontsize=40)

    # The 'real' bars are drawn on top of the 'total' bars, so the part of each
    # 'total' bar that stays visible corresponds to the non-real mentions.
    ax.bar(series.index, series['total'], align='center', label='Non-real')
    ax.bar(series.index, series['real'], align='center', label='Real')

    ax.set_xlabel('User', fontsize=20)
    ax.set_ylabel('Mentions', fontsize=20)
    # The categorical x axis already labels each bar with its index value; only
    # the font size needs adjusting. This avoids calling set_xticklabels without
    # fixed tick positions.
    ax.tick_params(axis='both', labelsize=18)
    ax.legend(loc='best', fontsize=20)

    # Save this specific figure rather than whatever pyplot considers current.
    fig.savefig(figname)

In [None]:
# First twelve rows of `users`, in its current order.
plot_users(users.iloc[:12], 'Most mentioned users', 'most-mentioned-users.png')

In [None]:
# Same ranking with the first row skipped positionally (rows 1..11). Per the
# title the skipped user is presumably YouTube -- verify if the data changes.
plot_users(users.iloc[1:12], 'Most mentioned users (without YouTube)', 'most-mentioned-users-no-yt.png')

## Veracidad del tweet por cantidad de usuarios mencionados

In [None]:
# Real / non-real tweet counts for each number of mentions. Only the two
# indicator columns are summed: `data` also carries the list-valued 'mentions'
# column, which should not take part in the aggregation.
data_by_user_mentions = data.groupby('mention_count')[['real', 'non-real']].sum()
data_by_user_mentions['total'] = data_by_user_mentions['real'] + data_by_user_mentions['non-real']

# Share of each class within its mention-count group (fractions in [0, 1]).
data_by_user_mentions['real-normalized'] = data_by_user_mentions['real'] / data_by_user_mentions['total']
data_by_user_mentions['non-real-normalized'] = data_by_user_mentions['non-real'] / data_by_user_mentions['total']

data_by_user_mentions

In [None]:
# Grouped bars: for every mention count, the share of real tweets (left bar)
# next to the share of non-real tweets (right bar). Heights are fractions of
# the tweets in that group.
fig, ax = plt.subplots(figsize=(30, 10))
fig.suptitle('Veracidad de tweets por user mentions', fontsize=40)
ax.bar(data_by_user_mentions.index + 0.2, data_by_user_mentions['non-real-normalized'], width=0.4, align='center', label='Non-real')
ax.bar(data_by_user_mentions.index - 0.2, data_by_user_mentions['real-normalized'], width=0.4, align='center', label='Real')
ax.autoscale(tight=True)
ax.legend(loc='upper left', fontsize=20)
ax.set_xlabel('menciones', fontsize=20)
ax.set_ylabel('% Tweets', fontsize=20)
# Pin one tick per mention count *before* assigning the labels; otherwise the
# labels are attached to whatever ticks the automatic locator picked and can
# end up shifted relative to the bars.
ax.set_xticks(data_by_user_mentions.index)
ax.set_xticklabels(data_by_user_mentions.index, fontsize=18)
ax.tick_params(axis="y", labelsize=18)
fig.savefig('veracity-by-user-mentions.png')

El gráfico muestra que, a medida que aumentan las menciones a otros usuarios, disminuye la proporción de tweets verídicos.
Sin embargo, la mayoría de los tweets tiene pocas menciones o ninguna, y la muestra se achica mucho a medida que estas aumentan, por lo que seguramente la cantidad de menciones no sirva por sí sola para determinar si un tweet es verídico o no.