In [1]:
import os
import csv
import pandas as pd
import numpy as np
import glob
# preprocess!
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

# Shared NLP helpers used by the cleaning functions further down.
stemmer = SnowballStemmer("english")
tokenizer = WhitespaceTokenizer()

# English stop words plus 'rt', so the retweet tag is filtered out as well.
stop_words = set(stopwords.words('english')) | {'rt'}

import re

import sys
import time

import warnings

# Silence all warnings, but only when sys.warnoptions is empty, i.e. no
# warning options (such as -W) were supplied to the interpreter.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
def merge_sents(sent):
    """Join an iterable of strings into one string separated by single spaces."""
    separator = ' '
    merged = separator.join(sent)
    return merged


In [3]:
def remove_links_and_html(sentence):
    """Delete URLs and HTML-style tags from ``sentence``.

    Anything starting with ``http`` up to the next whitespace is removed
    first, then every ``<...>`` tag. Surrounding whitespace is left as is,
    so a removed link can leave two adjacent spaces behind.
    """
    for pattern in (r'http\S+', r'<[^<]+?>'):
        sentence = re.sub(pattern, '', sentence)
    return sentence

def remove_punct(sentence):
    """Clean punctuation and special characters out of ``sentence``.

    Two passes are applied:

    1. ``?``, ``!``, ``'``, ``"`` and ``|`` are deleted outright.
    2. ``.``, ``,``, ``)``, ``(`` and ``/`` are each replaced by a space.

    Note that ``|`` inside the bracket expressions is a literal pipe, not an
    alternation, and that backslashes are not affected by either pass.
    Leading/trailing whitespace is stripped and remaining newlines become
    spaces.
    """
    without_quotes = re.sub(r'[?|!|\'|"|]', r'', sentence)
    spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_quotes)
    return spaced.strip().replace("\n", " ")

def remove_mentions(sentence):
    """Replace every ``@name`` (or ``@#name``) mention with a bare ``@``.

    The ``@`` is kept on purpose so that the presence of a mention can
    still be checked among separate groups. The name must be at least two
    word characters long to be treated as a mention.
    """
    mention_pattern = re.compile(r'@#?\b\w\w+\b')
    return mention_pattern.sub('@', sentence)

def valid_token(tok):
    """Decide whether a single token should be kept.

    Parameters
    ----------
    tok : str
        One whitespace-delimited token.

    Returns
    -------
    bool
        * Tokens containing ``#`` (hashtags) are kept when at least one ASCII
          letter or digit remains after stripping everything else, which
          drops hashtags written purely in non-Latin scripts (Arabic etc.).
        * Any other token is kept only when it consists entirely of ASCII
          letters/digits and is not in ``stop_words``.
    """
    # Strip everything except ASCII letters and digits once; both branches
    # compare against this reduced form.
    alnum_only = re.sub('[^0-9a-zA-Z]+', '', tok)
    if '#' in tok:
        return alnum_only != ''
    is_latin = alnum_only == tok
    # The retweet marker 'rt' is handled through stop_words; a substring
    # test for 'rt' would wrongly reject ordinary words such as "party".
    return is_latin and tok not in stop_words

def clean_stopwords(sentence):
    """Drop every token of ``sentence`` that ``valid_token`` rejects.

    The text is split with the module-level whitespace tokenizer and the
    surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in tokenizer.tokenize(sentence):
        if valid_token(token):
            kept.append(token)
    return ' '.join(kept)
        
def stem(sentence):
    """Stem every token of ``sentence`` and keep only the valid stems.

    Tokens are produced by the module-level whitespace tokenizer, stemmed
    with the module-level stemmer, filtered through ``valid_token`` and
    re-joined with single spaces.
    """
    stemmed = [stemmer.stem(token) for token in tokenizer.tokenize(sentence)]
    surviving = filter(valid_token, stemmed)
    return ' '.join(surviving)

def empty_to_nan(sentence):
    """Return ``np.nan`` for a zero-length ``sentence``, else return it unchanged.

    Turning empty texts into NaN lets a later ``dropna()`` prune them.
    """
    return sentence if len(sentence) >= 1 else np.nan

def clean_all(s):
    """Run the full text-cleaning pipeline on one string.

    Steps, in order: strip links and HTML tags, strip punctuation, collapse
    mentions to ``@``, and drop stop words / invalid tokens. A result that
    ends up empty is returned as ``np.nan`` so it can be dropped later.

    Lower-casing and stemming are currently disabled in this pipeline;
    stemming is slow on large data sets.
    """
    pipeline = (
        remove_links_and_html,
        remove_punct,
        remove_mentions,
        clean_stopwords,
    )
    for step in pipeline:
        s = step(s)
    # finally, make sure we have no empty texts
    return empty_to_nan(s)


In [None]:
# Clean every feed chunk in SPLIT/ and write one label+text CSV per chunk.
#
# The CSVs are written into the same SPLIT/ directory that is globbed for
# input, so anything ending in .csv is skipped here; otherwise a second run
# would try to parse its own output as NDJSON. Sorting makes the processing
# order deterministic.
split_dir = os.path.join(os.getcwd(), 'SPLIT')
samples = sorted(
    path for path in glob.glob(split_dir + '/*')
    if os.path.isfile(path) and not path.lower().endswith('.csv')
)
labels = pd.read_json('../data/labels.ndjson', lines=True)
for sample in samples:
    # Derive the output name from the file name itself rather than from a
    # fixed-width slice of the path, so names of any length work.
    csvname = 'SPLIT/' + os.path.basename(sample) + '.csv'
    print(csvname)
    print(sample)
    df = pd.read_json(sample, lines=True)
    # Inner join on 'id': only rows with a matching label are kept.
    combined = pd.merge(df, labels, on='id')
    #combined = combined.drop(columns=['id'])  # no need for any non-label data
    # 'text' is joined with spaces into a single string per row.
    combined['text'] = combined['text'].apply(merge_sents)
    combined = combined.dropna()
    print(combined.head())
    del df  # release the raw frame before the expensive cleaning step

    start = time.time()
    print('...cleaning')
    combined['text'] = combined['text'].str.lower()
    combined['text'] = combined['text'].apply(clean_all)
    end = time.time()
    print(end - start)

    # prune empty texts (clean_all returns NaN for texts that end up empty)
    combined = combined.dropna()
    combined.to_csv(csvname, header=False, index=False)
    




SPLIT/feed00.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed00
      id                                               text  birthyear  \
0  22704  Back at it with @americanidol looking for...he...       1984   
1  46305  The last presidential election turned on fewer...       1961   
2  30260  Angels ðŸ˜‡ \n@RobbieWilliams\nhttps://t.co/A6...       1989   
3   4874  Listen to â€œShallowâ€�, â€œAlways Remember Us...       1986   
4  41392  So happy for my island! Vote for Madeira, for ...       1985   

        fame  gender occupation  
0  superstar  female  performer  
1  superstar    male   politics  
2  superstar  female  performer  
3  superstar  female    creator  
4  superstar    male    manager  
...cleaning
151.7535059452057
SPLIT/feed01.csv
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\SPLIT\feed01
      id                                               text  birthyear  \
0  48084  Paul Ma