# 0 - Importing Packages and Dataframe

## 0.1 - Import Libraries

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
import kaggle
import re

from nltk.corpus import stopwords
import re
import string

# Fetch the NLTK stopwords corpus; stopwords.words('english') is used in section 1.5
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I43282N\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 0.2 - Import Dataset

In [8]:
from kaggle.api.kaggle_api_extended import KaggleApi

# Create the Kaggle API client and authenticate it before requesting the download
# (presumably reads credentials from the local Kaggle configuration; verify on a new machine)
api = KaggleApi()
api.authenticate()

# Download the dataset into the current directory and unzip it there
api.dataset_download_files('crowdflower/twitter-airline-sentiment', path='.', unzip=True)

Dataset URL: https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment


## 0.3 - Print DataFrame

In [9]:
# Load the tweets CSV from the working directory
# (presumably the file extracted by the Kaggle download step; verify the file name)
df = pd.read_csv('Tweets.csv')

In [10]:
# Preview the first five rows of the raw DataFrame
df.head(5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


# 1.0 - Data Manipulation

## 1.1 - Normalize the Data (Convert to Lowercase)

In [11]:
# Convert the tweet text to lowercase (stored in a new column; 'text' is left untouched)
df['text_lower'] = df['text'].str.lower()

In [12]:
# Visualize the result: lowercase text next to the original text
df[['text_lower', 'text']].head()

Unnamed: 0,text_lower,text
0,@virginamerica what @dhepburn said.,@VirginAmerica What @dhepburn said.
1,@virginamerica plus you've added commercials t...,@VirginAmerica plus you've added commercials t...
2,@virginamerica i didn't today... must mean i n...,@VirginAmerica I didn't today... Must mean I n...
3,@virginamerica it's really aggressive to blast...,@VirginAmerica it's really aggressive to blast...
4,@virginamerica and it's a really big bad thing...,@VirginAmerica and it's a really big bad thing...


## 1.2 - Remove URLs

In [13]:
# Remove URLs from text: drop every token starting with "http" or "www".
# The pattern needs single backslashes: inside a raw string, `\\S` means a literal
# backslash followed by "S", which never occurs in a URL, so nothing would be removed.
df['text_no_urls'] = df['text_lower'].apply(lambda x: re.sub(r'http\S+|www\S+', '', x))

In [14]:
# Visualize the result: compare the 'text_lower' and 'text_no_urls' columns
df[['text_lower', 'text_no_urls']].head()

Unnamed: 0,text_lower,text_no_urls
0,@virginamerica what @dhepburn said.,@virginamerica what @dhepburn said.
1,@virginamerica plus you've added commercials t...,@virginamerica plus you've added commercials t...
2,@virginamerica i didn't today... must mean i n...,@virginamerica i didn't today... must mean i n...
3,@virginamerica it's really aggressive to blast...,@virginamerica it's really aggressive to blast...
4,@virginamerica and it's a really big bad thing...,@virginamerica and it's a really big bad thing...


## 1.3 - Remove Mentioned Users (@users)

In [15]:
def _strip_mentions(tweet):
    """Return ``tweet`` with every ``@username`` token deleted."""
    return re.sub(r'@\w+', '', tweet)


# Strip user mentions ("@" followed by word characters) from the URL-free text
df['text_no_mentions'] = df['text_no_urls'].apply(_strip_mentions)

In [16]:
# Visualize the result: compare the 'text_no_urls' and 'text_no_mentions' columns
df[['text_no_urls', 'text_no_mentions']].head()

Unnamed: 0,text_no_urls,text_no_mentions
0,@virginamerica what @dhepburn said.,what said.
1,@virginamerica plus you've added commercials t...,plus you've added commercials to the experien...
2,@virginamerica i didn't today... must mean i n...,i didn't today... must mean i need to take an...
3,@virginamerica it's really aggressive to blast...,"it's really aggressive to blast obnoxious ""en..."
4,@virginamerica and it's a really big bad thing...,and it's a really big bad thing about it


## 1.4 - Remove Hashtags and Punctuation

In [17]:
def _strip_hashtags_and_punct(tweet):
    """Delete whole ``#hashtag`` tokens, then every character in ``string.punctuation``."""
    without_hashtags = re.sub(r'#\w+', '', tweet)
    punctuation_table = str.maketrans('', '', string.punctuation)
    return without_hashtags.translate(punctuation_table)


# Drop hashtags first, then punctuation, starting from the mention-free text
df['text_no_hashtags_punct'] = df['text_no_mentions'].apply(_strip_hashtags_and_punct)

In [18]:
# Visualize the result. The left column is 'text_no_urls' (not 'text_no_mentions'),
# so the comparison also shows the effect of the mention removal step.
df[['text_no_urls', 'text_no_hashtags_punct']].head()

Unnamed: 0,text_no_urls,text_no_hashtags_punct
0,@virginamerica what @dhepburn said.,what said
1,@virginamerica plus you've added commercials t...,plus youve added commercials to the experienc...
2,@virginamerica i didn't today... must mean i n...,i didnt today must mean i need to take anothe...
3,@virginamerica it's really aggressive to blast...,its really aggressive to blast obnoxious ente...
4,@virginamerica and it's a really big bad thing...,and its a really big bad thing about it


## 1.5 - Remove Numbers and Stopwords

In [19]:
# NLTK's English stopword list, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Remove numbers and stopwords.
# The digit pattern needs a single backslash: inside a raw string, `\\d` means a literal
# backslash followed by "d", so no digits would ever be removed.
# NOTE(review): punctuation was stripped in the previous step, so contractions reach this
# point as "didnt" / "youve" and are not dropped as stopwords; confirm whether that is intended.
df['clean_text'] = df['text_no_hashtags_punct'].apply(
    lambda x: re.sub(r'\d+', '', x)  # Remove numbers
).apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)  # Remove stopwords
)

In [20]:
# Visualize the result: compare the 'text_no_hashtags_punct' and 'clean_text' columns
df[['text_no_hashtags_punct', 'clean_text']].head()

Unnamed: 0,text_no_hashtags_punct,clean_text
0,what said,said
1,plus youve added commercials to the experienc...,plus youve added commercials experience tacky
2,i didnt today must mean i need to take anothe...,didnt today must mean need take another trip
3,its really aggressive to blast obnoxious ente...,really aggressive blast obnoxious entertainmen...
4,and its a really big bad thing about it,really big bad thing
