# Read Data

In [1]:
# Import Libraries and Data
from transformers import pipeline
import pandas as pd
import warnings 

warnings.simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv("source data/twitter_human_bots_dataset.csv", index_col=0)
df.head()
df_sentiment = df[['screen_name', 'description', 'account_type']]

  from .autonotebook import tqdm as notebook_tqdm


## Sentiment Analysis for `description` feature

In [4]:
# Preprocess the description column
description = df_sentiment['description'].fillna('').str.lower()
description = description.str.replace(r'@\w+', '', regex=True)
description = description.str.replace(r'http\S+', '', regex=True)
description = description.str.replace('#', '', regex=True)
description = description.str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Get description statistics
print(len(description))
description_lengths = description.apply(len)
print(description_lengths.describe())
print(description)

# Initialise model
model_name = 'bert-base-uncased'
sentiment_model = pipeline('sentiment-analysis')

# Get sentiment of the description
sentiments = description.apply(lambda text: sentiment_model(text)[0])

df1 = pd.DataFrame(sentiments.tolist())
print(df1.head())
# Takes 25 minutes to run


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


37438
count    37438.000000
mean        54.568273
std         48.785776
min          0.000000
25%          7.000000
50%         43.000000
75%         96.000000
max        173.000000
Name: description, dtype: float64
0        blame  inspired by  using cmu phonetic data to...
1        photographing the american west since 1980 i s...
2        scruffy looking nerf herder and  broadcaster\r...
3        wifegodmotherfriendfeline fanatic assistant pr...
4                              loan coach at   aspiring dj
                               ...                        
37433    role stock taker past roles nanny sales assist...
37434                       kingdom landlord freecornbread
37435        bienvenid al twitter oficial de sergio dalma 
37436    just a good guy wrapped up in a bad system\r\n...
37437                                                     
Name: description, Length: 37438, dtype: object


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


      label     score
0  NEGATIVE  0.999629
1  POSITIVE  0.992683
2  NEGATIVE  0.998187
3  NEGATIVE  0.570852
4  NEGATIVE  0.720050


### Store into original dataframe

In [8]:
# Add sentiment label to original df
df_sentiment['sentiment_label'] = df1['label'].map({'NEGATIVE': 0, 'POSITIVE': 1})

df_sentiment.groupby('account_type')['sentiment_label'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sentiment['sentiment_label'] = df1['label'].map({'NEGATIVE': 0, 'POSITIVE': 1})


account_type  sentiment_label
bot           1                   9294
              0                   3131
human         1                  14152
              0                  10861
Name: count, dtype: int64

### Save into external csv file

In [9]:
# Select relevant columns
df_sentiment = df_sentiment[['screen_name', 'sentiment_label']]

df_sentiment.to_csv('cleaned data/df_sentiment.csv', index=False)