# NLP Pipeline Prototyping

Load and preprocess the Facebook (FB) and Twitter (TW) data.

In [1]:
import os, sys, re, string, unicodedata
sys.path.append("..")
from config import credentials
import dropbox

import numpy as np
import pandas as pd

import nltk
nltk.data.path.append("../data/external/nltk_data")
from nltk import word_tokenize

import matplotlib.pyplot as plt
%matplotlib inline

## Loading

Paths

In [2]:
# Root folder of the CSV exports; the paths below are later passed to the
# Dropbox client's files_download, so they are Dropbox paths, not local ones.
data_path = "/Data/CSVData"

# Dropbox paths always use "/" as separator. os.path.join follows the local OS
# convention (it inserts "\" on Windows), so the paths are joined explicitly.
fb_posts_path = data_path + "/FBPolTimeLines.csv"
fb_comments_path = data_path + "/FBUserComments.csv"
twitter_posts_path = data_path + "/TwPolTimeLines.csv"
twitter_comments_path = data_path + "/TwUserComments.csv"

Load datasets

In [3]:
def _download_csv(dbx, path, media_source):
    """Download a CSV file from Dropbox and tag every row with its origin.

    Args:
        dbx: Dropbox client exposing ``files_download``.
        path: Dropbox path of the CSV file to fetch.
        media_source: Value stored in the ``media_source`` column of every row.

    Returns:
        pandas.DataFrame parsed from the CSV, plus the ``media_source`` column.
    """
    _, res = dbx.files_download(path)
    try:
        frame = pd.read_csv(res.raw)
    finally:
        # files_download hands back an open HTTP response; release the
        # connection even if parsing fails.
        res.close()
    frame["media_source"] = media_source
    return frame


# Build a client that acts as one team member, rooted at the team namespace.
team_dbx = dropbox.DropboxTeam(credentials.dropbox_team_access_token)
team_root = team_dbx.with_path_root(dropbox.common.PathRoot.namespace_id(
    credentials.dropbox_team_namespace_id))
user_dbx = team_root.as_user(credentials.dropbox_team_member_id)

# Facebook exports are tagged "fb", Twitter exports "tw".
fb_posts = _download_csv(user_dbx, fb_posts_path, "fb")
fb_comments = _download_csv(user_dbx, fb_comments_path, "fb")

twitter_posts = _download_csv(user_dbx, twitter_posts_path, "tw")
twitter_comments = _download_csv(user_dbx, twitter_comments_path, "tw")

Merge datasets

In [4]:
source_frames = [fb_posts, fb_comments, twitter_posts, twitter_comments]

# ignore_index=True renumbers the rows 0..n-1. Without it each source keeps its
# own labels, so the merged frame would contain duplicate index labels and any
# later label-based lookup or alignment could match several rows.
# sort=False keeps the column order instead of sorting the union of columns.
raw_data = pd.concat(source_frames, sort=False, ignore_index=True)

# Sanity check: concatenation must neither drop nor duplicate rows.
expected_rows = sum(frame.shape[0] for frame in source_frames)
assert raw_data.shape[0] == expected_rows, "False samples count"

print("Shape:", raw_data.shape)

Shape: (172047, 41)


## Preprocessing

Drop rows without text

In [5]:
# Keep only the rows whose "text" value is present; raw_data is left untouched.
data = raw_data.dropna(how="all", subset=["text"])

n_removed = raw_data.shape[0] - data.shape[0]
print("Removed rows:", n_removed)

Removed rows: 3533


Remove unnecessary columns

In [6]:
# Columns that are dropped before text preprocessing.
remove_cols = [
    "userID", "id", "replyToID", "replyToUser", "origPost",
    "Level", "Sampled", "CommentsAvail", "RepAvail", "FileNum",
    "verified",
    "quoted_text", "quoted_screen_name", "quoted_status_id",
    "quoted_favorite_count", "quoted_retweet_count", "quoted_verified",
    "retweet_text", "retweet_screen_name", "retweet_status_id",
    "retweet_favorite_count", "retweet_retweet_count", "retweet_verified",
    "source", "replyLevel",
]

# drop() returns a new frame, so `data` keeps all of its columns.
data_cleaned = data.drop(columns=remove_cols)

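Preprocess text (a "?" marks a step whose final handling is still to be decided):
* Lowercase ? (currently: everything is lowercased)
* Numbers ? (currently: standalone numbers are replaced by the placeholder `NUM`)
* Punctuation ? (currently: removed)
    * Mentions (the leading `@` is removed together with the other punctuation)
    * Hashtags (the leading `#` is removed together with the other punctuation)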

In [7]:
class _PunctuationTable(dict):
    """Translation table for ``str.translate`` that deletes punctuation.

    The table is seeded with ``string.punctuation``, which also contains ASCII
    symbols such as ``$``, ``+`` and ``~`` that Unicode does not classify as
    punctuation. Every other code point is classified the first time it is
    looked up and the result is cached in the dict.
    """

    def __init__(self):
        super().__init__(str.maketrans("", "", string.punctuation))

    def __missing__(self, codepoint):
        # Unicode general categories starting with "P" are punctuation; this
        # covers typographic quotes, dashes and ellipses.
        is_punctuation = unicodedata.category(chr(codepoint)).startswith("P")
        # None tells str.translate to delete the character; returning the code
        # point itself leaves the character unchanged.
        replacement = None if is_punctuation else codepoint
        self[codepoint] = replacement
        return replacement


# Built once and shared by all clean_text calls instead of once per call.
_PUNCTUATION_TABLE = _PunctuationTable()


def clean_text(text):
    """Clean text string.

    Steps, in order:
      1. lowercase the text;
      2. delete ASCII punctuation/symbols (``string.punctuation``) and all
         Unicode punctuation, e.g. the quotation marks „ “ ” « »;
      3. replace every standalone run of digits with the placeholder ``NUM``
         (digits embedded in a word, such as ``a1``, are kept).

    Whitespace is not normalised, so deleting a character between two spaces
    leaves both spaces in place.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    lowercased = text.lower()
    punct_removed = lowercased.translate(_PUNCTUATION_TABLE)
    # The placeholder is inserted after lowercasing, so it stays uppercase.
    num_replaced = re.sub(r"\b\d+\b", "NUM", punct_removed)
    return num_replaced

# Run clean_text on every entry of the "text" column; results go into a new column.
data_cleaned["cleaned_text"] = data_cleaned["text"].apply(clean_text)

Tokenize text

In [8]:
# Split each cleaned text into a list of tokens with NLTK's word_tokenize.
# NOTE(review): word_tokenize is called with its default language setting;
# confirm that this matches the language of the posts and comments.
data_cleaned["tokens"] = data_cleaned["cleaned_text"].apply(word_tokenize)

## Dump cleaned dataset

The cleaned dataset is used as input for Sentiment Analysis and Topic Modelling.

In [9]:
# Dropbox destination of the cleaned dataset. Joined with "/" explicitly:
# Dropbox paths always use forward slashes, whereas os.path.join follows the
# local OS convention (it inserts "\" on Windows).
dump_path = data_path + "/Test_CleanedTextDF.csv"
# The upload is currently commented out; when enabled it serialises the frame
# to CSV without the index and overwrites any existing file at dump_path.
# user_dbx.files_upload(bytes(data_cleaned.to_csv(index=False), "utf-8"),
#                       dump_path, mode=dropbox.files.WriteMode.overwrite)