In [2]:
import os
import csv
import pandas as pd
import numpy as np
import glob

In [3]:
# Build expanded.csv (one row per element of each feed's 'text', alongside that
# feed's labels) unless a cached copy already exists in the working directory.
# `final_df` stays None when the cached CSV is reused.
final_df = None
if not os.path.isfile('expanded.csv'):
    samples_dir = os.path.join(os.getcwd(), 'samples')
    # Sorted so the row order of expanded.csv does not depend on the order in
    # which the OS happens to list the directory.
    samples = sorted(glob.glob(os.path.join(samples_dir, '*.ndjson')))
    if not samples:
        # pd.concat([]) would otherwise fail with an opaque
        # "No objects to concatenate" error.
        raise FileNotFoundError(
            'no *.ndjson sample files found in {}'.format(samples_dir)
        )

    dfs = []
    for sample in samples:
        print(sample)
        df = pd.read_json(sample, lines=True)
        dfs.append(df)

    final_df = pd.concat(dfs, axis=0, ignore_index=True)
    labels = pd.read_json('../data/labels.ndjson', lines=True)
    # pd.merge defaults to an inner join: ids present in only one of the two
    # frames are dropped.
    combined = pd.merge(final_df, labels, on='id')
    # Spread each 'text' cell (presumably a list of tweets per id -- confirm
    # against the data) into one row per element, carrying the labels along.
    expanded = combined.set_index(
        ['id', 'birthyear', 'fame', 'gender', 'occupation']
    )['text'].apply(pd.Series).stack()
    expanded = expanded.reset_index()
    expanded = expanded.drop(columns=['level_5'])  # level_5 is the auto-generated new column, containing an index

    # The row index is written too; the loading cell expects it as the first column.
    expanded.to_csv('expanded.csv')


C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed00-sample.ndjson
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed01-sample.ndjson
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed02-sample.ndjson
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed03-sample.ndjson
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed04-sample.ndjson
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed05-sample.ndjson
C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\samples\feed06-sample.ndjson


In [4]:
# Preview the most recently loaded raw sample file. `df` is only bound when the
# first cell actually rebuilt expanded.csv in this session; when the cached CSV
# was reused there is nothing to preview, so show nothing instead of raising
# NameError.
df.head() if 'df' in globals() else None

Unnamed: 0,id,text
0,12587,"[RT @EdinburghJudo: Friday Squad girls, provid..."
1,2306,[An elected Matthew Guy Liberal Nationals Gove...
2,19951,[YAAAAASSS this is awesome. Over here in Michi...
3,13084,[RT @EsquireClassic: Mortal Combat: https://t....
4,22726,[How often does our brain focus on whats wrong...


In [3]:
# Load the expanded table from disk. The file exists on every run: the first
# cell either found expanded.csv or has just written it. Loading unconditionally
# replaces the former `if not final_df:` guard, which raised "The truth value of
# a DataFrame is ambiguous" whenever final_df held a DataFrame, and which would
# have left `data_raw` undefined for the following cells in that case anyway.
data_path = os.path.join(os.getcwd(), 'expanded.csv')
data_raw = pd.read_csv(data_path)
# First column is the row index written by to_csv; last column is the text.
data_raw.columns = ['index', 'id', 'birthyear', 'fame', 'gender', 'occupation', 'text']
data_raw = data_raw.drop(columns=['index', 'id'])  # no need for any non-label data

print(data_raw.shape)  # before dropping NaN values

data_raw = data_raw.dropna()
data_raw['birthyear'] = data_raw['birthyear'].astype(int)  # from 1978.0 -> 1978
print(data_raw.shape)


(15962300, 5)
(15962300, 5)


In [8]:
# `data` is a second name for the same DataFrame object as `data_raw`
# (plain assignment makes no copy).
data = data_raw
# Preview goes last: only a cell's final expression is displayed.
data.head()

In [5]:
# Preprocessing setup: the tokenizer, stop-word set and stemmer that the
# cleaning helpers rely on.
import re
import sys
import warnings

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
stemmer = SnowballStemmer("english")

# English stop words, extended with the retweet tag.
stop_words = set(stopwords.words('english'))
stop_words.add('rt')

# Silence every warning unless the interpreter was started with -W options.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [6]:
def remove_links_and_html(sentence):
    """Strip URLs and HTML-style tags from ``sentence``.

    First removes every run that starts with ``http`` and continues up to the
    next whitespace, then removes every ``<...>`` span that contains no ``<``
    between the brackets. Removed text is replaced by the empty string, so
    surrounding spaces are left as they were.
    """
    for pattern in (r'http\S+', r'<[^<]+?>'):
        sentence = re.sub(pattern, '', sentence)
    return sentence

def remove_punct(sentence):
    """Remove or blank out a fixed set of punctuation characters.

    Question marks, exclamation marks, single quotes, double quotes and pipes
    are deleted. Full stops, commas, parentheses and forward slashes are
    replaced by a space. The ``|`` inside the character classes is a literal
    pipe, not alternation. Finally the text is stripped of surrounding
    whitespace and any newline still inside it becomes a space.
    """
    without_quotes = re.sub(r'[?|!|\'|"|]', r'', sentence)
    spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_quotes)
    return spaced.strip().replace("\n", " ")

def remove_mentions(sentence):
    """Delete ``@name`` and ``@#name`` handles from ``sentence``.

    Only handles with at least two word characters are removed; a lone
    ``@x`` is left untouched.
    """
    mention_pattern = r'@#?\b\w\w+\b'
    return re.sub(mention_pattern, '', sentence)

def valid_token(tok):
    """Decide whether a token survives cleaning.

    Tokens containing ``#`` (hashtags) are kept when they contain at least one
    ASCII letter or digit. Any other token is kept only if it consists solely
    of ASCII letters/digits and is not in ``stop_words``.

    Returns:
        bool: True if the token should be kept.
    """
    ascii_alnum = re.sub('[^0-9a-zA-Z]+', '', tok)
    if '#' in tok:
        # Reject hashtags with no ASCII alphanumerics at all (e.g. purely
        # Arabic ones); mixed hashtags still pass.
        return ascii_alnum != ''
    # Deliberately no substring test for 'rt': it would also reject ordinary
    # words such as "party". The exact token 'rt' is expected to be in
    # stop_words.
    return ascii_alnum == tok and tok not in stop_words

def clean_stopwords(sentence):
    """Tokenize ``sentence`` and re-join only the tokens ``valid_token`` accepts."""
    kept = filter(valid_token, tokenizer.tokenize(sentence))
    return ' '.join(kept)
        
def stem(sentence):
    """Stem every token of ``sentence`` and keep the stems ``valid_token`` accepts.

    Returns the surviving stems joined by single spaces.
    """
    kept = []
    for token in tokenizer.tokenize(sentence):
        stemmed = stemmer.stem(token)
        if valid_token(stemmed):
            kept.append(stemmed)
    return ' '.join(kept)

def empty_to_nan(sentence):
    """Return ``np.nan`` for a zero-length ``sentence``, else return it unchanged."""
    return sentence if len(sentence) > 0 else np.nan

def clean_all(s):
    """Run the full cleaning pipeline on one text.

    Steps, in order: strip links and HTML tags, strip punctuation, strip
    @mentions, drop tokens rejected by ``valid_token``, and finally turn an
    empty result into NaN. Lower-casing is not done here, and stemming
    (``stem``) is intentionally not part of the pipeline because it is slow on
    large amounts of data.
    """
    pipeline = (
        remove_links_and_html,
        remove_punct,
        remove_mentions,
        clean_stopwords,
        empty_to_nan,  # must stay last: makes sure no empty text is returned
    )
    for step in pipeline:
        s = step(s)
    return s


In [9]:
import time

start = time.time()

# Build a new frame rather than assigning into `data` in place: `data` is the
# same object as `data_raw` (see `data = data_raw`), so an in-place column
# assignment would silently overwrite the raw text as well.
cleaned_text = data['text'].str.lower().apply(clean_all)
data = data.assign(text=cleaned_text)
# recorded run: about 606 s for roughly 16 million texts (stemming disabled)
end = time.time()
print(end - start)

# prune empty texts (clean_all returns NaN for texts that end up empty)
data = data.dropna()
data.head(30)

606.405600309372


Unnamed: 0,birthyear,fame,gender,occupation,text
0,1984,superstar,female,performer,back looking circa early coeur dalene idaho
1,1984,superstar,female,performer,buy happiness world state mind
2,1984,superstar,female,performer,30
3,1984,superstar,female,performer,new york new york
4,1984,superstar,female,performer,pink love first sight new york new york
5,1984,superstar,female,performer,putting best foot forward memphis #ffanyâ€™s 2...
6,1984,superstar,female,performer,girls united never divided
7,1984,superstar,female,performer,brb buying stephanie bow shoe early christmas ...
8,1984,superstar,female,performer,connect dots bbs #theclara teamkp #katyperryonqvc
9,1984,superstar,female,performer,geometry favorite subject school #octagonheel ...


In [10]:
# Display the current (rows, columns) of `data`.
data.shape

(15273740, 5)

In [39]:
# Spot-check: text and birthyear of the rows labelled with birth year 1941.
x = data['birthyear'].eq(1941)
y = data.loc[x, ['text', 'birthyear']]
y.head()

Unnamed: 0,text,birthyear
356947,1981 first election become mayor burlington 10...,1941
356948,climate change single greatest threat facing p...,1941
356949,important amazon recognize workers rights stop...,1941
356950,america wealth income inequality major develop...,1941
356951,south join saturday #medicareforall rally rsvp,1941


In [41]:
def mkdir(path):
    """Create directory ``path`` (including parents), tolerating an existing one.

    Prints a notice when the directory is already there.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    try:
        # Attempt the creation directly instead of checking first, so there is
        # no window between the check and the creation.
        os.makedirs(path)
    except FileExistsError:
        if not os.path.isdir(path):
            # A regular file is in the way; reporting "folder exists" would be
            # misleading and later writes into it would fail.
            raise
        print('folder exists: {}'.format(path))

# the folder to hold the datasets (csv) with [text, label]
single_label_dir = os.path.join(os.getcwd(), 'single-label')
mkdir(single_label_dir)

# Every column except the text itself is a label category (selected by name
# rather than by position, so column order does not matter).
categories = [col for col in data.columns if col != 'text']
print(categories)

for categ in categories:
    print(categ)
    vals = sorted(data[categ].unique())
    print(vals)
    # create a folder for each category
    categ_path = os.path.join(single_label_dir, categ)
    mkdir(categ_path)
    # store each corresponding dataframe in respective category folders
    for val in vals:
        # rows whose label equals `val`, reduced to the text and that label
        tmp_df = data.loc[data[categ] == val, ['text', categ]]
        # os.path.join instead of a hard-coded '\\' so the path is also valid
        # on non-Windows systems
        save_path = os.path.join(categ_path, str(val) + '.csv')
        print('storing {}'.format(save_path))
        tmp_df.to_csv(save_path)
        del tmp_df  # release the slice before the next one is built


folder exists: C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label
['birthyear', 'fame', 'gender', 'occupation']
birthyear
[1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008]
folder exists: C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\1940.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\1941.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAn

storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2000.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2001.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2002.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2003.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2004.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2005.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2007.csv
storing C:\Users\Tollef\Desktop\Spring2019\TextAnalysis\project\PAN-celebrity-profiling\data\single-label\birthyear\2008.csv
