# Age vs Word-Prevalence Metrices 

In [1]:
# Algorithm/Steps:
# step-1 :: Read blog authorship corpora 
#
# step-2 :: Clean blog authorship corpora
#          (a) Remove xml tags such as <Blog>, <post>, <date>
#          (b) Remove newline and tab characters
#          (c) Tokenize blog text
#          (d) Convert to lower-case
#          (e) Remove punctuation
#          (f) Remove non-alphabetic tokens
#          (g) Remove stopwords
#          (h) Convert to pandas dataframe for further processing
#
# step-3 :: Read word-prevalnce dataset
#

### Essential imports

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import pickle

from nltk.corpus import stopwords
nltk.download('stopwords')

import os
import string

[nltk_data] Downloading package stopwords to /Users/tofii/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Functions to read and prepare the blogs-authorship corpora

In [3]:
def read_file(filename):
    with open(filename,'r', encoding = 'ISO-8859-1') as f:
        data = f.read()
    return data
    
def clean_data(data):    
    ''' 
    Function to clean the data.
    '''
    # Remove XML tags
    data = data.replace('<Blog>','')
    data = data.replace('</Blog>','')
    data = data.replace('<post>','')
    data = data.replace('</post>','')
    data = data.replace('<date>','')
    data = data.replace('</date>','')
    
    # Remove newline and tab characters
    data = data.replace('\n','')
    data = data.replace('\t','')
    
    # Tokenize text with nltk
    tokens = nltk.word_tokenize(data)
    
    # Convert to lower case
    tokens = [w.lower() for w in tokens]
    
    # Remove punctuations
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]

    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    return words

### Prepare Corpora

In [67]:
files = os.listdir('blogs_data')
posts = []
header = ['ID','GENDER','AGE','INDUSTRY','ASTROLOGICAL SIGN','POST']

counter = 0
for file in files:
    #print(file)
    attribs = file.split('.')[0:-1]
    
    # Read file
    post = read_file('blogs_data/'+file)
    
    # Clean post
    cleaned_post = clean_data(post)
    
    # Add post to attribs list
    attribs.append(cleaned_post)
    
    # Append post and other data to posts list
    posts.append(attribs)
    
    counter += 1
    if counter % 100 == 0:
        print('{}/{} blogs processed'.format(counter, len(files)))
# Prepare pandas dataframe
posts_df = pd.DataFrame(posts, columns=header)

# Print sample
posts_df.head()

100/19320 blogs processed
200/19320 blogs processed
300/19320 blogs processed
400/19320 blogs processed
500/19320 blogs processed
600/19320 blogs processed
700/19320 blogs processed
800/19320 blogs processed
900/19320 blogs processed
1000/19320 blogs processed
1100/19320 blogs processed
1200/19320 blogs processed
1300/19320 blogs processed
1400/19320 blogs processed
1500/19320 blogs processed
1600/19320 blogs processed
1700/19320 blogs processed
1800/19320 blogs processed
1900/19320 blogs processed
2000/19320 blogs processed
2100/19320 blogs processed
2200/19320 blogs processed
2300/19320 blogs processed
2400/19320 blogs processed
2500/19320 blogs processed
2600/19320 blogs processed
2700/19320 blogs processed
2800/19320 blogs processed
2900/19320 blogs processed
3000/19320 blogs processed
3100/19320 blogs processed
3200/19320 blogs processed
3300/19320 blogs processed
3400/19320 blogs processed
3500/19320 blogs processed
3600/19320 blogs processed
3700/19320 blogs processed
3800/19320

Unnamed: 0,ID,GENDER,AGE,INDUSTRY,ASTROLOGICAL SIGN,POST
0,4162441,male,16,Student,Sagittarius,"[destiny, might, say, anything, hear, chosen, ..."
1,3489929,female,25,Student,Cancer,"[long, time, coming, made, serious, decisions,..."
2,3954575,female,23,BusinessServices,Gemini,"[sit, work, three, hours, left, guess, bad, ti..."
3,3364931,male,16,Student,Virgo,"[today, normal, nothing, much, talk, except, g..."
4,3162067,female,24,Education,Cancer,"[feel, water, crystal, vibrations, mother, bea..."


### Read word-prevalence corpora

In [68]:
# Read data as pandas dataframe
prevalence_df = pd.read_csv("preval_data/prevalence.csv")

# print sample data
prevalence_df.head()

Unnamed: 0,Word,Pknown,Nobs,Prevalence,FreqZipfUS
0,a,0.98,438,1.917,7.309
1,aardvark,0.96,434,1.684,2.634
2,aardwolf,0.21,428,-0.788,1.292
3,abaca,0.24,396,-0.706,1.593
4,aback,0.86,343,1.077,2.496


In [69]:
def append_to_file(row):
    with open('count_matrix.csv','a') as f:
        f.writelines(','.join(row))
        f.write('\n')

In [70]:
header = posts_df.columns.values.tolist()
header.remove('POST')
for word in prevalence_df.Word:
    if isinstance(word, str) and word not in stop_words:
        header.append(word)
append_to_file(header)

### Creation of count matrix

In [71]:
stop_words = set(stopwords.words('english'))

for idx, row in posts_df.iterrows():
    count_list = row[0:5].values.tolist()
    post = row[5]
    for word in prevalence_df.Word:
        if isinstance(word, str) and word not in stop_words:
            cnt = post.count(word)
            count_list.append(str(cnt))
    append_to_file(count_list)