In [1]:
import os
import csv
import pandas as pd
import numpy as np
import glob

In [2]:
# Combined CSV file for each split.
train_csv = 'combined.csv'
test_csv = 'combined-test.csv'

# Folder holding the raw sample files for each split.
train_path = 'samples'
test_path = 'samples-test'

# Set to False to process the training split instead of the test split.
test = True

# Pick the CSV/folder pair that matches the selected split.
_csv, _path = (test_csv, test_path) if test else (train_csv, train_path)

In [3]:
# `final_df` stays None when the combined CSV already exists on disk.
final_df = None
if not os.path.isfile(_csv):
    # Every entry inside the selected samples folder is read as line-delimited JSON.
    samples_dir = os.path.join(os.getcwd(), _path)
    samples = glob.glob(samples_dir + '/*')

    dfs = []
    for sample in samples:
        print(sample)
        dfs.append(pd.read_json(sample, lines=True))

    final_df = pd.concat(dfs, axis=0, ignore_index=True)

    # Attach the labels to the texts through the shared 'id' column.
    labels = pd.read_json('../data/labels.ndjson', lines=True)
    combined = final_df.merge(labels, on='id')

    # Each 'text' entry is joined into a single string with '. ' as separator.
    combined['text'] = combined['text'].apply('. '.join)
    combined.to_csv(_csv)

In [4]:
# Always load the combined CSV: the previous cell guarantees that `_csv` exists
# (it was either already on disk or has just been written). Guarding this with
# a truthiness test on `final_df` is not an option, because evaluating a
# DataFrame as a boolean raises
# "ValueError: The truth value of a DataFrame is ambiguous", and skipping the
# load would leave `data_raw` undefined for the cells that follow.
data_path = os.path.join(os.getcwd(), _csv)
data_raw = pd.read_csv(data_path)

# The first column is the index that `to_csv` wrote out; name every column
# explicitly, then keep only the text and the label columns.
data_raw.columns = ['index', 'id', 'text', 'birthyear', 'fame', 'gender', 'occupation']
data_raw = data_raw.drop(columns=['index', 'id'])  # no need for any non-label data

print(data_raw.shape)  # before dropping NaN values

data_raw = data_raw.dropna()
print(data_raw.shape)  # after dropping NaN values


(3500, 5)
(3500, 5)


In [5]:
# Work on an independent copy so the cleaning steps further down assign into
# `data` without mutating `data_raw`.
data = data_raw.copy()

# Keep the preview as the last expression of the cell; otherwise the notebook
# discards its value and nothing is displayed.
data.head()

In [6]:
# preprocess!
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

# Tokens are produced by splitting on whitespace.
tokenizer = WhitespaceTokenizer()

# English stop words plus 'rt', the retweet tag.
stop_words = set(stopwords.words('english')) | {'rt'}

# Snowball stemmer for English, used by `stem`.
stemmer = SnowballStemmer('english')

import re

import sys
import warnings

# Silence all warnings unless warning options were given on the command line (-W).
warnings_requested = bool(sys.warnoptions)
if not warnings_requested:
    warnings.simplefilter("ignore")

In [7]:
def remove_links_and_html(sentence):
    """Return ``sentence`` without links and HTML-like tags.

    Every run that starts with ``http`` and continues up to the next
    whitespace is deleted first; afterwards every ``<...>`` span is deleted.
    """
    without_links = re.sub(r'http\S+', '', sentence)
    return re.sub(r'<[^<]+?>', '', without_links)

def remove_punct(sentence):
    """Clean ``sentence`` of punctuation and special characters.

    * ``? ! ' " |`` are deleted outright.
    * ``. , ( ) \\ /`` are replaced by a single space each.
    * Leading/trailing whitespace is stripped and remaining newlines become
      spaces.

    Inside a regex character class ``|`` is a literal pipe, not an
    alternation, so the classes are written without separators. The backslash
    is escaped explicitly (``\\\\``): writing ``\\|`` only escapes the pipe and
    never matches a backslash.
    """
    cleaned = re.sub(r'[?!\'"|]', '', sentence)
    cleaned = re.sub(r'[.,()\\/]', ' ', cleaned)
    cleaned = cleaned.strip()
    return cleaned.replace("\n", " ")

def remove_mentions(sentence):
    """Collapse every @mention in ``sentence`` to a bare ``@``.

    A mention is ``@`` (optionally followed by ``#``) and a name of at least
    two word characters. The ``@`` itself is kept so that the presence of a
    mention stays visible in the text.
    """
    mention_pattern = r'@#?\b\w\w+\b'
    return re.sub(mention_pattern, '@', sentence)

def valid_token(tok):
    """Decide whether ``tok`` should be kept in a cleaned text.

    * A token containing ``#`` is kept when it has at least one ASCII letter
      or digit (this filters out hashtags written purely in other scripts);
      the stop-word check is not applied to it.
    * Any other token is kept when it is not in the module-level
      ``stop_words`` set and consists solely of ASCII letters and digits.

    Returns:
        bool: True if the token is kept, False otherwise.
    """
    if '#' in tok:
        # make sure the hashtag is alphanumeric (avoiding arabic etc)
        return re.search(r'[0-9a-zA-Z]', tok) is not None
    if tok in stop_words:
        return False
    # No substring test for 'rt' here: the retweet tag is matched as a whole
    # token through `stop_words`, and a substring test would also reject
    # ordinary words that merely contain "rt".
    return re.fullmatch(r'[0-9a-zA-Z]*', tok) is not None

def clean_stopwords(sentence):
    """Tokenize ``sentence`` and keep only tokens accepted by ``valid_token``.

    The surviving tokens are returned joined by single spaces.
    """
    kept_tokens = filter(valid_token, tokenizer.tokenize(sentence))
    return ' '.join(kept_tokens)
        
def stem(sentence):
    """Stem every token of ``sentence`` and drop stems rejected by ``valid_token``.

    Tokens come from the module-level ``tokenizer``, stems from the
    module-level ``stemmer``; the kept stems are joined by single spaces.
    """
    stemmed_tokens = map(stemmer.stem, tokenizer.tokenize(sentence))
    return ' '.join(word for word in stemmed_tokens if valid_token(word))

def empty_to_nan(sentence):
    """Return ``np.nan`` for an empty ``sentence``, otherwise ``sentence`` itself."""
    return sentence if len(sentence) >= 1 else np.nan

def clean_all(s):
    """Run one text through the cleaning pipeline.

    Steps, in order: strip links/HTML, strip punctuation, collapse mentions,
    drop stop words and invalid tokens, and finally turn an empty result into
    NaN. The function does not lower-case its input.
    """
    pipeline = (
        remove_links_and_html,
        remove_punct,
        remove_mentions,
        clean_stopwords,
        # `stem` is left out on purpose: stemming is slow on loads of data.
        # Insert it here to enable it.
        empty_to_nan,  # finally, make sure we have no empty texts
    )
    for step in pipeline:
        s = step(s)
    return s


In [8]:
import time

# Time the whole cleaning pass.
start = time.time()

# Lower-case every text first, then push it through the cleaning pipeline.
data['text'] = data['text'].str.lower().apply(clean_all)

# run time: around 3-4 minutes per 1 million texts
end = time.time()
print(end - start)

# Texts that ended up empty were turned into NaN by the pipeline; prune them.
data = data.dropna()
data.head(30)

211.90762305259705


Unnamed: 0,text,birthyear,fame,gender,occupation
0,awwww thank much best year equipped ive advent...,1989,superstar,female,performer
1,tell secrets dog never heard dog talk beloved ...,1988,superstar,male,performer
2,backatcha thanks known gonna live long taken b...,1958,superstar,male,sports
3,laugh loud friends never seen gets every time ...,1992,superstar,female,sports
4,heartbreaking #ripmacmiller hours still answer...,1994,superstar,female,performer
5,yes last night #nkotbcruisex #repost yesterday...,1976,superstar,female,performer
6,happy world straight edge day given day usuall...,1989,superstar,female,sports
7,hard think week ago strolling streets palma he...,1980,superstar,female,sports
8,gazetecilik tam da budur gazeteci diye dalkavu...,1972,superstar,male,science
9,made proud bruins 2019 schedule mark calendars...,1996,superstar,female,sports


In [9]:
# Dimensions of `data` as left by the cleaning cell (rows with NaN already dropped).
data.shape

(3499, 5)

In [10]:
# Write the cleaned frame to a CSV named after the source file, prefixed with "cleaned-".
cleaned_csv = 'cleaned-{}'.format(_csv)
data.to_csv(cleaned_csv)

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. As written it would define a `mkdir` helper, create the
# 'single-labeled-combined-text' folder in the working directory and list the
# columns of `data` after the first one as categories.
'''
def mkdir(path):
    if not os.path.exists(path):
        os.makedirs(path)
    else:
        print('folder exists: {}'.format(path))

# the folder to hold the datasets (csv) with [text, label]
single_label_dir = os.path.join(os.getcwd(), 'single-labeled-combined-text')
mkdir(single_label_dir)

categories = list(data.columns.values)
categories = categories[1:]
print(categories)
'''

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. As written it would, for every category and every distinct value
# of it, store the matching rows (text + that category column) as one CSV in
# a per-category folder.
# NOTE(review): the save path is assembled with a backslash separator, which
# only forms a proper path on Windows; confirm before re-enabling.
'''
for categ in categories:
    print(categ)
    vals = sorted(data[categ].unique())
    print(vals)
    # create a folder for each category
    categ_path = os.path.join(single_label_dir, categ)
    mkdir(categ_path)
    # store each corresponding dataframe in respective category folders        
    for val in vals:
        #print('{}: {}'.format(val, type(val)))
        condition = data[categ] == val
        tmp_df = data[condition][['text', categ]]  # this extracts the text and column
        save_path = categ_path + '\\' + str(val) + '.csv'
        print('storing {}'.format(save_path))
        tmp_df.to_csv(save_path)
        del tmp_df
'''