In [1]:
# Package imports
import itertools
import pandas as pd
import preprocessor as p
import re
import sklearn.feature_extraction
import string
import unidecode
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from langdetect import detect

# Thin wrapper around print, used for all diagnostic output in this notebook
def debug(x):
    """Print ``x`` to standard output."""
    print(x)

In [2]:
# Translation table that deletes every character in string.punctuation
rm_punc = str.maketrans(dict.fromkeys(string.punctuation))

# Stopword set: sklearn's English stopwords plus any project-specific extras
new_words_temp = [] # <- Add any additional stopwords here
temp_word = set(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
temp_word.update(new_words_temp)
stopwords = frozenset(temp_word)

# One instance of each NLTK stemmer
porter, lancaster = PorterStemmer(), LancasterStemmer()

In [54]:
# Location of the line-delimited JSON tweet export.
# NOTE(review): this is an absolute path on one machine; point it at your own
# copy of the dataset (ideally relative to the notebook) before running.
TWEETS_PATH = '/Users/tomashegewisch/Downloads/030110/LIM368_20200307.json'

# Load the tweets. lines=True reads one JSON object per line, i.e. a sequence
# of records, so the orient is 'records' ('record' is not one of the values
# pandas documents for read_json).
tweets = pd.read_json(TWEETS_PATH, lines=True, orient='records')

# Raise the display limit so that every row of the dataset can be shown
pd.set_option('display.max_rows', tweets.shape[0]+1)

In [4]:
# Drop tweets whose `id` has already been seen (keeping the first occurrence)
# and report the row count before and after.
debug("BEFORE")
debug(len(tweets))
# Rebind the result rather than mutating with inplace=True: the data flow
# stays explicit and the call remains chainable.
tweets = tweets.drop_duplicates(subset=['id'], keep="first")
debug("\nAFTER")
debug(len(tweets))

BEFORE
1117

AFTER
1117


In [5]:
# Remove all tweets which are not English
def language(text):
    """Return the language code that ``detect`` reports for ``text``.

    If detection raises for any reason, "en" is returned instead, so a
    caller that keeps only "en" rows also keeps tweets whose language could
    not be determined.
    """
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback. Catching Exception (rather than a bare
        # except) lets KeyboardInterrupt/SystemExit propagate, so a long run
        # over many tweets can still be interrupted.
        return "en"
    
debug(len(tweets))
# Keep only the tweets detected as English. The explicit .copy() makes the
# result an independent frame, so adding columns to it afterwards does not
# raise pandas' SettingWithCopyWarning.
tweets = tweets[tweets['tweet'].apply(language) == "en"].copy()
debug(len(tweets))

1117
773


In [6]:
# Stopword and link removal
def remove_stopwords(word):
    """Blank out stopwords and link-like tokens.

    Returns '' when ``word`` is in the module-level ``stopwords`` collection,
    starts with 'http' or 'pictwittercom', or ends with 'com' or 'coza';
    otherwise returns ``word`` unchanged.
    """
    link_prefixes = ('http', 'pictwittercom')
    link_suffixes = ('com', 'coza')
    # Short-circuit order mirrors the checks: stopword first, then link shape
    discard = (
        word in stopwords
        or word.startswith(link_prefixes)
        or word.endswith(link_suffixes)
    )
    return '' if discard else word

# Preprocessing the tweet
def preprocess(text):
    """Normalise raw tweet text to lower-case words and whitespace.

    Steps, applied in order to the whole string: strip HTML tags, strip
    punctuation (anything that is neither a word character nor whitespace),
    strip digits, turn newlines into spaces, lower-case.

    Parameters
    ----------
    text : str or iterable of str
        The raw text. A non-string iterable is cleaned piece by piece and the
        cleaned pieces are concatenated.

    Returns
    -------
    str
        The cleaned text; '' for empty input.
    """
    if not isinstance(text, str):
        # Backward compatibility: callers may hand in a list of strings
        return ''.join(preprocess(piece) for piece in text)

    # Tags must be matched on the full string; character-by-character
    # processing can never see a complete '<...>' span. The pattern requires
    # a letter after '<' (optionally preceded by '/') so that text such as
    # "<3" is not mistaken for markup.
    new_text = re.sub(r'</?[A-Za-z][^>]*>', '', text)   # remove HTML tags
    new_text = re.sub(r'[^\w\s]', '', new_text) # remove punctuation
    new_text = re.sub(r'\d+', '', new_text) # remove numbers
    new_text = re.sub('\n', ' ', new_text) # newlines become spaces
    return new_text.lower()

# Cleaning and tokenising the tweet
def clean_tweet(tweet):
    """Turn a raw tweet into a list of cleaned tokens.

    The text goes through ``preprocess`` and ``unidecode.unidecode``, is
    lower-cased and split on whitespace. Every word is then passed to
    ``remove_stopwords``; words it blanks out, and tokens shorter than two
    characters, are discarded.
    """
    normalised = unidecode.unidecode(preprocess(tweet)).lower()
    filtered = (remove_stopwords(word) for word in normalised.split())
    return [
        token
        for kept in filtered if kept != ''
        for token in kept.split() if len(token) > 1
    ]

In [55]:
# Tokenise every tweet; each cell of the new column holds a list of tokens
tweets['tokenised'] = tweets['tweet'].map(clean_tweet)

In [23]:
# Quick check of preprocess on a sentence with capitals and punctuation
cleaned_sample = preprocess("Hi my name is Jack the Quack *snort snort*")
debug(cleaned_sample)

hi my name is jack the quack snort snort


In [57]:
def find_non_text(text):
    """Flag an empty token list.

    Returns the marker string "NA" when ``text`` equals ``[]`` and ``None``
    for anything else.
    """
    return "NA" if text == [] else None

In [60]:
# Discard tweets whose token list is empty, reporting the row count
# before and after.
debug("Before")
debug(len(tweets))
has_tokens = tweets['tokenised'].apply(find_non_text) != "NA"
tweets = tweets.loc[has_tokens]
debug("After")
debug(len(tweets))

Before
1117
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
After
904


In [10]:
#tweets['tokenised']

In [11]:
# Flatten the per-tweet token lists into one long list of words
words = list(itertools.chain.from_iterable(tweets['tokenised'].tolist()))

# Table of the 100 most frequent words, indexed by the word itself
rslt = pd.DataFrame(
    Counter(words).most_common(100),
    columns=['Word', 'Frequency'],
).set_index('Word')
debug(rslt)

                Frequency
Word                     
im                     46
polokwane              22
just                   22
like                   21
people                 18
dont                   17
time                   16
limpopo                16
know                   15
love                   13
black                  13
need                   12
ultimateloveng         12
baroka                 12
leopards               12
mall                   11
team                   11
good                   11
eish                   11
thank                  10
come                   10
hope                    9
did                     9
eababanights            9
youre                   9
absaprem                9
kids                    9
day                     9
fc                      9
turfloop                9
north                   9
africa                  9
yes                     8
win                     8
game                    8
today                   8
lets        

In [12]:
# Persist the current `tweets` frame to "test.pkl" in the working directory
tweets.to_pickle("test.pkl")

In [13]:
# Read the pickled frame back from "test.pkl" into `df`.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# you created yourself.
df = pd.read_pickle("test.pkl")