In [None]:
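# pip install clean-text   (provides the `cleantext` module used below; the similarly named "cleantext" package ships a different clean() that does not appear to accept no_emoji - verify before installing)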

In [None]:
import numpy as np
import pandas as pd
import string
import nltk
import os
import glob
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from cleantext import clean
import re

In [None]:
# Get base path: the directory the notebook kernel was started in
base_path = os.getcwd()

# Set file path of the raw data set directory.
# os.path.join picks the separator of the host OS; the empty last component
# keeps a trailing separator so that plain string concatenation of a file
# name onto this path still yields a valid location.
input_file_path = os.path.join(base_path, 'dataset', '')

In [None]:
# Combine all countries tweet
joined_files = os.path.join(input_file_path, "tweets_*.csv")

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined file identical from one run to the next.
joined_list = sorted(glob.glob(joined_files))
if not joined_list:
    # Fail with the offending pattern instead of pandas' generic concat error
    raise FileNotFoundError('No files matching ' + joined_files)

# Join the files
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
print(df)

# Write combined tweets onto a new file.
# mode='w' overwrites the file so that re-running this cell does not append a
# second copy of every tweet. The file is written WITHOUT a header row, so
# anything reading it back must not treat the first line as column names.
df.to_csv(os.path.join(input_file_path, 'all_tweets.csv'), mode='w', index=False, header=False)

In [None]:
# Read the combined data set.
# all_tweets.csv is written with header=False, so it has no header row:
# header=None stops pandas from consuming the first tweet as column names.
df = pd.read_csv(input_file_path + 'all_tweets.csv', header=None)
# Assigning the names afterwards still fails loudly if the file does not have
# exactly three columns.
df.columns = ['id', 'date', 'content']

df.head()

In [None]:
# Horizontal rule reused for every section banner below
rule = '-------------------'

# First section: a preview of the first rows
print(rule, 'Dataset content', rule, sep='\n')
print(df.head())
print(rule)

# Some information about the data set
row_total = len(df)
print('Length of data set: ', row_total)
print('Shape of data set: ', df.shape)

# Second section: column dtypes and non-null counts, printed by DataFrame.info
print(rule, 'Dataset information', rule, sep='\n')
df.info()

In [None]:
# Check for null values - needs to be zero.
# The count is printed explicitly: only the last expression of a cell is
# displayed, so a bare expression followed by df.head() would never be shown.
null_row_count = int(df.isnull().any(axis=1).sum())
print('Rows with null values: ', null_row_count)
df.head()

In [None]:
# 1. Converting all text to lower case
def convert_to_lower_case(data_set):
    """Lower-case the 'content' column of ``data_set`` in place.

    The frame passed in is modified directly and nothing is returned.
    """
    lowered = data_set['content'].str.lower()
    data_set['content'] = lowered

# Lower-case the tweet text in place, then preview the frame
convert_to_lower_case(data_set=df)
df.head()

In [None]:
# 2. Removing stop words (un-necessary words) - using nltk's pre-defined stop words
# STOP_WORDS = set(stopwords.words('english'))
# print('Stop word list:')
# print('----------------')
# print(STOP_WORDS)

# def remove_stop_words(content):
#    return " ".join([text for text in str(content).split() if text not in STOP_WORDS])

# df['content'] = df['content'].apply(lambda content: remove_stop_words(content=content))
# df.head()

In [None]:
# 3. Removing URLs
def remove_URLS(content):
    """Replace every URL in ``content`` with a single space.

    A URL is either a token starting with a literal ``www.`` or one starting
    with ``http://`` / ``https://``; it extends up to the next whitespace.
    ``content`` is converted with ``str()`` first, so non-string values are
    processed as their string representation.

    Returns the resulting string.
    """
    # Raw string: '\S' is not a valid escape in a normal string literal.
    # The dot after 'www' is escaped so that it only matches a real '.'.
    return re.sub(r'((www\.\S+)|(https?://\S+))', ' ', str(content))

# Strip URLs from every tweet, then preview the frame
df['content'] = df['content'].apply(remove_URLS)
df.head()

In [None]:
# 4. Removing @mentions
def remove_mentions(content):
    """Replace every @mention in ``content`` with a single space.

    A mention is an ``@`` followed by one or more non-whitespace characters,
    so anything attached to the handle (e.g. trailing punctuation) goes too.
    ``content`` is converted with ``str()`` before matching.

    Returns the resulting string.
    """
    # Raw string: '\S' is not a valid escape in a normal string literal.
    return re.sub(r'(@\S+)', ' ', str(content))

# Strip @mentions from every tweet, then preview the frame
df['content'] = df['content'].apply(remove_mentions)
df.head()

In [None]:
# 5. Removing numbers
def remove_numericals(content):
    """Delete every run of ASCII digits from ``content``.

    ``content`` is converted with ``str()`` first, matching the other cleaning
    helpers in this notebook, so a non-string value does not raise.

    Returns the resulting string.
    """
    return re.sub(r'[0-9]+', '', str(content))

# Strip digits from every tweet, then preview the frame
df['content'] = df['content'].apply(remove_numericals)
df.head()

In [None]:
# 6. Removing punctuations
PUNCTUATIONS = string.punctuation
print('Punctuation list:')
print('------------------')
print(PUNCTUATIONS)

# Deletion table built once here rather than on every call: the function is
# applied to each row, and the table only depends on PUNCTUATIONS.
_PUNCTUATION_TABLE = str.maketrans('', '', PUNCTUATIONS)

def remove_punctuations(content):
    """Delete every character listed in ``PUNCTUATIONS`` from ``content``.

    ``content`` is converted with ``str()`` first. Characters are removed
    outright (not replaced by a space), so words separated only by
    punctuation are joined together.

    Returns the resulting string.
    """
    return str(content).translate(_PUNCTUATION_TABLE)

# Strip punctuation from every tweet, then preview the frame
df['content'] = df['content'].apply(remove_punctuations)
df.head()

In [None]:
# 7. convert emoji to text
def emoji_convert(content):
    """Return ``content`` with known emoji/emoticons replaced by their text.

    The lookup is delegated to the module-level ``kp_all_emoji_emoticons``
    keyword processor, which must exist by the time this function is called.
    """
    converted = kp_all_emoji_emoticons.replace_keywords(content)
    return converted

from emot.emo_unicode import UNICODE_EMOJI, UNICODE_EMOJI_ALIAS, EMOTICONS_EMO
from flashtext import KeywordProcessor
# Merge the three emot lookup tables into one mapping. Later tables win on
# duplicate keys, so an entry in UNICODE_EMOJI_ALIAS takes precedence over the
# same key in UNICODE_EMOJI, which in turn takes precedence over EMOTICONS_EMO.
all_emoji_emoticons = {**EMOTICONS_EMO, **UNICODE_EMOJI, **UNICODE_EMOJI_ALIAS}

# Turn each replacement into plain words: drop ':' and use spaces instead of '_'
all_emoji_emoticons = {
    symbol: label.replace(":", "").replace("_", " ").strip()
    for symbol, label in all_emoji_emoticons.items()
}

# Register every symbol with its replacement text.
# NOTE(review): the text has already been lower-cased and stripped of
# punctuation before this table is used, so keys made of punctuation or
# upper-case letters presumably never match - confirm the intended step order.
kp_all_emoji_emoticons = KeywordProcessor()
for k, v in all_emoji_emoticons.items():
    kp_all_emoji_emoticons.add_keyword(k, v)

# Replace emoji/emoticons with their text in every tweet, then preview
df['content'] = df['content'].apply(emoji_convert)

df.head()


In [None]:
# df[df.colA.map(lambda x: x.isascii())]

# Independent copy of the frame at this point of the pipeline
# (not referenced by any of the cells visible in this notebook)
df1 = df.copy(deep=True)
# df1['DB_user'].str.encode('ascii', 'ignore').str.decode('ascii')
# df1['new'] = df1['content'].apply(lambda content: content.encode('ascii', 'ignore').str.decode('ascii'))
# df1.head()


In [None]:
# 8. Removing 2 letter random words, as it gives no meaning
def remove_2letter_words(content):
    """Drop words of one or two characters and squeeze runs of spaces.

    A "word" is a run of ``\\w`` characters delimited by word boundaries.
    Leading/trailing spaces left behind by a removal are collapsed to a
    single space but not stripped.
    """
    without_short_words = re.sub(r'\b\w{1,2}\b', '', content)
    squeezed = re.sub(' +', ' ', without_short_words)
    return squeezed

# Drop 1-2 character words from every tweet, then preview the frame
df['content'] = df['content'].apply(remove_2letter_words)
df.head()

In [None]:
# 9. Remove unicodes
def remove_unicodes(content):
    """Return ``str(content)`` with every non-ASCII character removed."""
    # Encoding with errors='ignore' silently drops anything outside ASCII
    ascii_bytes = str(content).encode('ascii', errors='ignore')
    return ascii_bytes.decode()

# Strip non-ASCII characters from every tweet, then preview the column
df['content'] = df['content'].apply(remove_unicodes)
df['content'].head()

In [None]:
# 10. Remove unwanted emojis
def _strip_emoji(content):
    """Run cleantext's ``clean`` on ``str(content)`` with ``no_emoji=True``."""
    return clean(str(content), no_emoji=True)

df['content'] = df['content'].apply(_strip_emoji)
df.head()

In [None]:
# 11. Removing meaningless words
# English word list from nltk (built here; not consulted by the filter below)
nltk.download('words')
words = set(nltk.corpus.words.words())


def _is_ascii_text(value):
    """True only for ``str`` values made up entirely of ASCII characters."""
    return isinstance(value, str) and value.isascii()


# Keep only the rows whose content is an ASCII string. The mask is applied
# directly instead of going through a temporary column; .copy() yields an
# independent frame so the assignments below do not act on a slice.
ascii_mask = df['content'].apply(_is_ascii_text).astype(bool)
df = df[ascii_mask].copy()

# Second pass of the short-word and punctuation clean-up
df['content'] = df['content'].apply(remove_2letter_words)
# df['content'] = df['content'].apply(lambda content: remove_stop_words(content=content))
df['content'] = df['content'].apply(remove_punctuations)

df.head()

In [None]:
# Stemming of the words
stemmer = nltk.PorterStemmer()
def stemming_content(content):
    """Stem every word of ``content`` with the module-level Porter stemmer.

    * A string is split on whitespace, each word is stemmed, and the stems
      are joined back together with single spaces (a string is returned).
    * Any other iterable is treated as a sequence of tokens and a list of
      stems is returned.

    Iterating the string directly would stem it character by character, and
    the stemmed result must be returned for the step to have any effect.
    """
    if isinstance(content, str):
        return ' '.join(stemmer.stem(word) for word in content.split())
    return [stemmer.stem(word) for word in content]

# Run the stemming step over every tweet, then preview the column
df['content'] = df['content'].apply(stemming_content)
df['content'].head()

In [None]:
# Lemmatizing the tokens
lemmatizer = nltk.WordNetLemmatizer()
def lemmatizing_content(content):
    """Lemmatize every word of ``content`` with the module-level lemmatizer.

    * A string is split on whitespace, each word is lemmatized, and the
      lemmas are joined back together with single spaces (a string is
      returned).
    * Any other iterable is treated as a sequence of tokens and a list of
      lemmas is returned.

    ``lemmatize`` is called without a part-of-speech argument, i.e. with the
    lemmatizer's default. Iterating the string directly would process it
    character by character, and the result must be returned for the step to
    have any effect.
    """
    if isinstance(content, str):
        return ' '.join(lemmatizer.lemmatize(word) for word in content.split())
    return [lemmatizer.lemmatize(word) for word in content]

# Run the lemmatizing step over every tweet, then preview the column
df['content'] = df['content'].apply(lemmatizing_content)
df['content'].head()

In [None]:
# Get base path
base_path = os.getcwd()

# Set file path of the output directory (the variable name is reused from the
# input step). os.path.join picks the host OS separator; the empty last
# component keeps a trailing separator.
input_file_path = os.path.join(base_path, 'CleanedDatasets', '')

# Make sure the directory exists so the write below cannot fail on a fresh checkout
os.makedirs(input_file_path, exist_ok=True)

# mode='w' overwrites the file so that re-running the notebook does not append
# a second copy of every cleaned tweet. The file is written without a header row.
df.to_csv(os.path.join(input_file_path, 'tweets_cleaned.csv'), mode='w', index=False, header=False)