In [1]:
import warnings

warnings.filterwarnings("ignore")


#### Import libraries and load the data set

In [2]:
import pandas as pd
from textblob import TextBlob
import numpy as np
from PIL import Image
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the raw tweets into a single-column frame named "text".
# `on_bad_lines="warn"` is the pandas >= 1.3 spelling of the removed
# `error_bad_lines=False` option (with its default warning behaviour): rows
# that parse into more than one field are reported and skipped.
# NOTE(review): the skip log in this cell's output shows many rows being
# dropped for having extra fields (presumably tweets containing unquoted
# commas) — confirm the file format and whether those rows should be kept.
df = pd.read_csv(
    "Task-1 tweets_1000.csv",
    header=None,
    on_bad_lines="warn",
    encoding="utf8",
    names=["text"],
)


b'Skipping line 8: expected 1 fields, saw 5\nSkipping line 10: expected 1 fields, saw 4\nSkipping line 11: expected 1 fields, saw 4\nSkipping line 14: expected 1 fields, saw 2\nSkipping line 15: expected 1 fields, saw 2\nSkipping line 19: expected 1 fields, saw 2\nSkipping line 20: expected 1 fields, saw 5\nSkipping line 24: expected 1 fields, saw 4\nSkipping line 40: expected 1 fields, saw 2\nSkipping line 46: expected 1 fields, saw 3\nSkipping line 56: expected 1 fields, saw 2\nSkipping line 57: expected 1 fields, saw 2\nSkipping line 59: expected 1 fields, saw 3\nSkipping line 69: expected 1 fields, saw 2\nSkipping line 72: expected 1 fields, saw 3\nSkipping line 74: expected 1 fields, saw 2\nSkipping line 78: expected 1 fields, saw 3\nSkipping line 87: expected 1 fields, saw 2\nSkipping line 91: expected 1 fields, saw 2\nSkipping line 95: expected 1 fields, saw 2\nSkipping line 97: expected 1 fields, saw 5\nSkipping line 100: expected 1 fields, saw 2\nSkipping line 101: expected 1 

In [3]:
# Preview the first five loaded rows.
df.head(n=5)


Unnamed: 0,text
0,\xf0\x9f\x98\x91\xf0\x9f\x98\x91\xf0\x9f\x98\x...
1,Jasmine Strange shares a message of hope durin...
2,I gotta fight these allergies in public to mak...
3,https://t.co/57NBQ2XQsG On Easter please reme...
4,@lenibriscoe I have a cute one made from recyc...


#### Preprocess text

In [4]:
# Ordered (pattern, replacement) pairs that flatten HTML markup into plain
# text. Order matters: whitespace is collapsed before the tag rules run.
_HTML_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r">\s+", ">"),  # drop whitespace that follows a ">"
        (r"\s+", " "),  # collapse every whitespace run into one space
        (r"\s*<br\s*/?>\s*", "\n"),  # <br> becomes a line break
        (r"</(div)\s*>\s*", "\n"),  # </div> becomes a line break
        (r"</(p|h\d)\s*>\s*", "\n\n"),  # </p> and </hN> become a blank line
        (r"<head>.*<\s*(/head|body)[^>]*>", ""),  # drop <head> through </head> (or <body>)
        (r'<a\s+href="([^"]+)"[^>]*>.*</a>', r"\1"),  # keep the link target, not its text
        (r"[ \t]*<[^<]*?/?>", ""),  # strip any remaining tags
        (r"^\s+", ""),  # strip leading whitespace
    )
)

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Matches the run of word characters around any digit (e.g. "2020", "4u").
_DIGIT_TOKEN = re.compile(r"\w*\d\w*")


def _stop_words():
    """Return the stop-word set, reading the NLTK corpus only on first use."""
    cached = getattr(_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words("english")) | {"hi", "im"}
        _stop_words._cache = cached
    return cached


def clean_string(text: str) -> str:
    """Normalise one raw text into a lower-case, space-separated token string.

    Steps, in order: lower-case, flatten HTML markup, strip ASCII
    punctuation, drop English stop words (plus "hi" and "im"), remove
    digit-bearing word runs, lemmatise with WordNet, and finally run
    TextBlob's ``correct()`` over the joined result.

    Parameters
    ----------
    text : str
        The raw text. ``None`` and NaN are treated as missing.

    Returns
    -------
    str
        The cleaned text; ``""`` when the input is missing or nothing
        survives the filters.
    """
    # Missing cells (None / float NaN) carry no text; NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = text.lower()

    # Apply the markup rules in order, trimming trailing whitespace after each.
    for pattern, replacement in _HTML_RULES:
        text = pattern.sub(replacement, text).rstrip()

    # The rules above turn <br>, </div>, </p> into line breaks. Replace them
    # with spaces (not nothing) so the words on either side are not fused.
    text = text.replace("\n", " ")

    text = text.translate(_PUNCTUATION_TABLE)

    # NOTE(review): punctuation is already gone here, so a contraction such as
    # "don't" arrives as "dont" and will only be filtered if the stop-word
    # list contains that exact spelling — verify against the NLTK list.
    stop_words = _stop_words()
    tokens = [word for word in text.split() if word not in stop_words]

    # Remove digit-bearing word runs, then discard tokens left empty so the
    # join below does not produce stray spaces.
    tokens = [_DIGIT_TOKEN.sub("", word) for word in tokens]
    tokens = [word for word in tokens if word]

    if not tokens:
        return ""

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in tokens]

    return str(TextBlob(" ".join(lemmas)).correct())


In [5]:
# Clean every row of the text column. Series.map calls clean_string once per
# row and works on an empty column, whereas np.vectorize without `otypes`
# evaluates the first row twice and raises on size-0 input.
df["text"] = df["text"].map(clean_string)

#### Save the cleaned data to a pickle file

In [6]:
import pickle

# Serialise the text column's underlying values array to disk.
with open("cleaned text.pkl", mode="wb") as pickle_file:
    cleaned_values = df["text"].values
    pickle.dump(cleaned_values, pickle_file)