# Data Cleaning

Having acquired the data, this notebook will clean and preprocess text and extract new metadata.

In [1]:
import pandas as pd
import numpy as np
import re
import pyLDAvis.sklearn
from datetime import datetime as dt

In [2]:
df = pd.read_pickle('../data/interim/aggregated.pkl')

The below shows lines we need to drop:

In [3]:
df.loc[df.Posts.isnull()].head()

Unnamed: 0,GUID,Date (GMT),URL,Contents,Author,Name,Country,State/Region,City/Urban Area,Category,Emotion,Source,Klout Score,Gender,Posts,Followers,Following
2267654,817337076572102656,,http://twitter.com/Solutionprovida/status/8173...,http://twitter.com/Solutionprovida/status/8173...,,,United Kingdom,North West,Liverpool,,,Twitter,51.0,M,,,
2267655,817352877421248512,,http://twitter.com/shaancheema/status/81735287...,http://twitter.com/shaancheema/status/81735287...,,,United Kingdom,Greater London,London,,,Twitter,53.0,,,,
2267656,817452492174790656,,http://twitter.com/AWarwickThomps1/status/8174...,http://twitter.com/AWarwickThomps1/status/8174...,,,United Kingdom,,,,,Twitter,42.0,,,,
2267657,817330843609874432,,http://twitter.com/martytechno1/status/8173308...,http://twitter.com/martytechno1/status/8173308...,,,,,,,,Twitter,43.0,M,,,
2267658,817494566832078848,,http://twitter.com/achairukdpc/status/81749456...,http://twitter.com/achairukdpc/status/81749456...,,,United Kingdom,,,,,Twitter,35.0,F,,,


In [4]:
from datetime import datetime
from dateutil.parser import parse

In [5]:
def remove_handles(text):
    return re.sub('@[^\s]+','',text)

def remove_hashtags(text):
    return re.sub('#[^\s]+',string=text,repl='')

def remove_RT(text):
    return re.sub('^RT ',string=text,repl='')

def remove_url(text):
    return re.sub('http[^\s]+',string=text,repl='')

def process_text(text):
    return (remove_url(remove_RT(remove_hashtags(remove_handles(text))))).strip()

def process_text_ht(text):
    return (remove_url(remove_RT(remove_handles(text)))).strip()

In [6]:
df['StrippedHasHashtag'] = df['Contents'].map(process_text_ht)

In [7]:
df['Date (GMT)'] = df['Date (GMT)'].map(lambda x : dt.strptime(x, '%d/%m/%Y %H:%M'))

TypeError: strptime() argument 1 must be str, not float

In [9]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Lars Buitinck
#         Chyi-Kwei Yau <chyikwei.yau@gmail.com>
# License: BSD 3 clause

from __future__ import print_function
from time import time
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
#from sklearn.datasets import fetch_20newsgroups
import docx

n_samples = 200
n_features = 5000
n_topics = 10
n_top_words = 10

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

# print("Loading dataset...")
# t0 = time()
# dataset = fetch_20newsgroups(shuffle=True, random_state=1,
#                              remove=('headers', 'footers', 'quotes'))
# data_samples = dataset.data[:n_samples]
# print("done in %0.3fs." % (time() - t0))

data_samples = df_ge.StrippedHasHashtag.sample(n=10000,random_state=123)

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df = 2, max_features=n_features,
                                stop_words='english',
                               ngram_range = (1,2))
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

print('Max number of times a word appears in a sentence is %d, min is %d.\n' % (tf.A.max(),tf.A.min()))

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (len(data_samples), n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0,doc_topic_prior = 0.001)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf features for LDA...
done in 1.641s.
Max number of times a word appears in a sentence is 10, min is 0.

Fitting LDA models with tf features, n_samples=10000 and n_features=5000...
done in 167.603s.

Topics in LDA model:
Topic #0:
great, today, labour, eu, campaign, ge2017, thanks, team, thank, campaigning
Topic #1:
uk, yemen, stoparmingsaudi, saudi, redlineforyemen, stop, stoparmingsaudi redlineforyemen, strong, post, tax
Topic #2:
parliament, proud, trump, just, commons, moment, speaker, welcome, racism, visit
Topic #3:
theresa, debate, rt, british, leader, pm, bbcdebate, vote, single, mps
Topic #4:
manifesto, just, cut, make, like, workers, does, oh, rights, words
Topic #5:
votesnp, ge17, support, schools, tories, did, want, labour, women, scotland
Topic #6:
labour, tory, don, party, vote, need, brexit, government, hope, right
Topic #7:
like, britain, day, left, world, power, 50, real, looks, city
Topic #8:
time, school, year, free, june, just, media, vote, young, meet
T

In [11]:
test = pyLDAvis.sklearn.prepare(lda_model=lda, vectorizer=tf_vectorizer, dtm=tf)
pyLDAvis.display(test)