In [None]:
# utilities
import numpy as np
import pandas as pd
import string
import os

# pre-processing utilities
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from cleantext import clean
import re

In [None]:
# Base path: the parent directory of the current working directory
base_path = os.path.normpath(os.path.join(os.getcwd(), os.pardir))
# Folder with the raw data sets. Built with os.path.join so it works on any
# operating system; the trailing '' component keeps a trailing separator so a
# file name can still be appended with plain string concatenation.
file_path = os.path.join(base_path, 'RawDatasets', '')

print(file_path)

In [None]:
# Load the raw tweets and give the columns fixed names
df = pd.read_csv(file_path + 'tweets_india.csv')
df.columns = ['id', 'date', 'content']

print(df.head())

# Quick overview of the data set: size, shape and column summary
print('Length of data set: ', len(df))
print('Shape of data set: ', df.shape)
print('Dataset information', '-------------------', sep='\n')
df.info()

# Count of rows holding at least one null value - needs to be zero
df.isnull().any(axis=1).sum()

In [None]:
# 1. Converting all text to lower case
def convert_to_lower_case(data_set, column='content'):
    """Lower-case a text column of ``data_set`` in place.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    column : str, default 'content'
        Name of the column to lower-case.

    Returns
    -------
    pandas.DataFrame
        The same ``data_set`` object, returned so calls can be chained.
    """
    data_set[column] = data_set[column].str.lower()
    return data_set

# Lower-case the tweet text in place, then preview the frame
convert_to_lower_case(data_set=df)
df.head()


In [None]:
# 2. Removing stop words (un-necessary words) - using nltk's pre-defined stop words

# English stop words shipped with nltk, kept as a set for fast membership tests
STOP_WORDS = set(stopwords.words('english'))
print('Stop word list:', '----------------', STOP_WORDS, sep='\n')

def remove_stop_words(content):
    """Return ``content`` without the whitespace-separated words found in STOP_WORDS.

    ``content`` is converted with ``str()`` first; the surviving words are
    joined back together with single spaces.
    """
    kept_words = []
    for word in str(content).split():
        if word in STOP_WORDS:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

# Run the stop-word filter over every tweet and preview the frame
df['content'] = df['content'].apply(remove_stop_words)
df.head()

In [None]:
# 3. Removing URLs

def remove_URLS(content):
    """Replace every URL in ``content`` with a single space.

    A URL is a run of non-whitespace characters starting with ``www.``,
    ``http://`` or ``https://``. ``content`` is converted with ``str()``
    before matching.

    Returns
    -------
    str
        The text with each URL replaced by ``' '``.
    """
    # \S+ consumes up to the next whitespace; the previous pattern used [^s],
    # which stops at the letter "s" instead. The dot after www is escaped so
    # that it only matches a literal ".".
    return re.sub(r'(www\.\S+)|(https?://\S+)', ' ', str(content))

# Strip URLs from every tweet and preview the column
df['content'] = df['content'].apply(remove_URLS)
df['content'].head()


In [None]:
# 4. Removing @mentions

def remove_mentions(content):
    """Replace every @mention in ``content`` with a single space.

    A mention is ``@`` followed by one or more non-whitespace characters.
    ``content`` is converted with ``str()`` before matching.

    Returns
    -------
    str
        The text with each mention replaced by ``' '``.
    """
    # Raw string: "\S" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'@\S+', ' ', str(content))

# Strip @mentions from every tweet and preview the column
df['content'] = df['content'].apply(remove_mentions)
df['content'].head()

In [None]:
# 5. Removing numbers

def remove_numericals(content):
    """Delete every run of ASCII digits from ``content``.

    ``content`` is converted with ``str()`` first, like the other cleaning
    helpers in this notebook, so non-string cells do not raise ``TypeError``.

    Returns
    -------
    str
        The text with all characters ``0``-``9`` removed.
    """
    return re.sub(r'[0-9]+', '', str(content))

# Drop digits from every tweet and preview the column
df['content'] = df['content'].apply(remove_numericals)
df['content'].head()

In [None]:
# 6. Removing punctuations
# Punctuation characters to strip, taken from the standard library
PUNCTUATIONS = string.punctuation
print(
    'Punctuation list:',
    '------------------',
    PUNCTUATIONS,
    '------------------',
    sep='\n',
)

def remove_punctuations(content):
    """Return ``str(content)`` with every character listed in PUNCTUATIONS deleted."""
    # Mapping a character to None makes translate() drop it
    deletion_table = str.maketrans(dict.fromkeys(PUNCTUATIONS))
    text = str(content)
    return text.translate(deletion_table)

# Strip punctuation from every tweet and preview the column
df['content'] = df['content'].apply(remove_punctuations)
df['content'].head()

In [None]:
# Remove unicodes
def remove_unicodes(content):
    """Return ``str(content)`` with every non-ASCII character dropped."""
    text = str(content)
    # Characters that cannot be encoded as ASCII are silently skipped
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode()

# Drop non-ASCII characters from every tweet and preview the column
df['content'] = df['content'].apply(remove_unicodes)
df['content'].head()


In [None]:
# 7. Remove unwanted emojis

# Pass every tweet through cleantext's clean() with emoji removal switched on
# NOTE(review): only no_emoji is set, so clean()'s other defaults also apply - confirm that is intended
df['content'] = df['content'].apply(lambda tweet: clean(str(tweet), no_emoji=True))
df['content'].head()


In [None]:
# 8. Tokenizing the texts
# Split every tweet into a list of tokens with nltk's word_tokenize
df['content'] = df['content'].apply(lambda tweet: word_tokenize(str(tweet)))
df['content'].head()

In [None]:
# 9. Stemming of the words
stemmer = nltk.PorterStemmer()
def stemming_content(content):
    """Stem every token in ``content`` with the module-level Porter stemmer.

    Parameters
    ----------
    content : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The stemmed tokens, in the original order.
    """
    # Return the stemmed list; the previous version built it but returned the
    # unmodified input, so stemming had no effect.
    return [stemmer.stem(word) for word in content]

# Stem the tokens of every tweet and preview the column
df['content'] = df['content'].apply(stemming_content)
df['content'].head()

In [None]:
# 10. Lemmatizing the tokens
lemmatizer = nltk.WordNetLemmatizer()
def lemmatizing_content(content):
    """Lemmatize every token in ``content`` with the module-level WordNet lemmatizer.

    Parameters
    ----------
    content : iterable of str
        Tokens of one tweet.

    Returns
    -------
    list of str
        The lemmatized tokens, in the original order.
    """
    # Return the lemmatized list; the previous version built it but returned
    # the unmodified input, so lemmatization had no effect.
    return [lemmatizer.lemmatize(word) for word in content]

# Lemmatize the tokens of every tweet and preview the column
df['content'] = df['content'].apply(lemmatizing_content)
df['content'].head()

In [None]:
# Writing pre-processed data into csv
# The file is written to the current working directory, as before. The
# directory is now resolved at run time instead of being a machine-specific
# absolute path that was assigned but never used.
output_file_path = os.getcwd()
df.to_csv(os.path.join(output_file_path, 'tweets_india_preprocessed.csv'), index=False)