# Data Cleaning

In the notebook, we will take a look at the structure of data and clean as necessary.

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /Users/a200/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/a200/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('tweet_emotions.csv')

In [3]:
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
sentiment_counts = data['sentiment'].value_counts()
print(sentiment_counts)

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64


In [5]:
# Deleting Empty Lines
data = data[data['content'] != '']

def clean_text(text):
    # Removing URLs
    text = re.sub(r'http\S+', '', text)
    # Removing User Mentions and Hashtag symbols
    text = re.sub(r'[@#][\w]*', '', text)
    # Removing RT（Retweet）
    text = re.sub(r'RT', '', text)
    # Removing Special Characters and Punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Removing numbers from tweet text
    text = re.sub(r'\d', '', text)
    # Converting to lowercase
    text = text.lower()
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words]
    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

data['cleaned_content'] = data['content'].apply(clean_text)
data.to_csv('cleaned_data.csv', index=False)
data.head()

Unnamed: 0,tweet_id,sentiment,content,cleaned_content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,know wa listenin bad habit earlier started fre...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,Funeral ceremony...gloomy friday...,funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants to hang out with friends SOON!,want hang friend soon
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,want trade someone ha houston ticket one
