# Used to clean blog data

In [None]:
# Created: 22 Feb 2019

In [1]:
import numpy as np
import pandas as pd

import random
import os
import xmltodict, json
import csv

import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


In [22]:
# Functions
def convertXML(name):
    """
    Converts a blog to a list of dates and posts 
    
    name: blog title
    
    """
    
    with open ("data/blogs/{}".format(name), "r", encoding = "ISO-8859-1") as myfile:
        data=myfile.read().replace('\n', '')

    data = data.replace("<Blog><date>",'')
    data = data.replace("</post></Blog>",'')
    data = data.replace('#','')
    data = data.replace("</date><post>",'#')
    data = data.replace("</post><date>",'#')

    data = data.split('#')
    #print(data)
    data = np.array(data)
    
    return data

def writeToCSV(author,name,csv_path): 
    """
    Returns a list of strings of the for author,age,gender,date,post
    
    author: author id
    name: blog title 
    csv_path: path to csv
    """
    age = name.split(".")[2]
    gender = name.split(".")[1]

    data = convertXML(name)
    dates = data[::2]
    dates = [d.replace(',','/') for d in dates ]
    posts = data[1::2]
    with open(csv_path, 'a') as writeFile:
        csv.register_dialect('myDialect', delimiter = '#')
        writer = csv.writer(writeFile, dialect='myDialect')
        
        for i in range(len(dates)):
            #line = "#".join(map(str,[author,age,gender,dates[i],posts[i]]))
            row = [author,age,gender,dates[i],posts[i]]
            writer.writerow(row)
            
    writeFile.close()


### Write raw blog data to a csv

In [33]:
#Read in blog titles
titles = os.listdir("data/blogs/")
#titles = random.sample(titles, 1000)
print(len(titles))
print(titles[1])

19320
3489929.female.25.Student.Cancer.xml


In [34]:
csv_path = 'data/blogs_raw.csv'

#Initialize csv file
with open(csv_path, 'w') as writeFile:
        csv.register_dialect('myDialect', delimiter = '#')
        writer = csv.writer(writeFile, dialect='myDialect')
        row = ["author","age","gender","date","post"]
        writer.writerow(row)
            
writeFile.close()

#write blogs to csv
for i,t in enumerate(titles):
    writeToCSV(i,t,'data/blogs_raw.csv')
    if i%1000 == 0: print("Processed {}/19320 blogs".format(i))
    

Processed 0/19320 blogs
Processed 1000/19320 blogs
Processed 2000/19320 blogs
Processed 3000/19320 blogs
Processed 4000/19320 blogs
Processed 5000/19320 blogs
Processed 6000/19320 blogs
Processed 7000/19320 blogs
Processed 8000/19320 blogs
Processed 9000/19320 blogs
Processed 10000/19320 blogs
Processed 11000/19320 blogs
Processed 12000/19320 blogs
Processed 13000/19320 blogs
Processed 14000/19320 blogs
Processed 15000/19320 blogs
Processed 16000/19320 blogs
Processed 17000/19320 blogs
Processed 18000/19320 blogs
Processed 19000/19320 blogs


### Clean CSV post data <br>
Every post has been cleaned by:

<ul>
    <li> Removing extra spaces
    <li> Punctuation removed
    <li> Made lower case
    <li> Stemmed
</ul>

In [2]:
prevanlence = pd.read_csv("data/prevalence.csv")
prevanlence.head()

Unnamed: 0,Word,Pknown,Nobs,Prevalence,FreqZipfUS
0,a,0.98,438,1.917,7.309
1,aardvark,0.96,434,1.684,2.634
2,aardwolf,0.21,428,-0.788,1.292
3,abaca,0.24,396,-0.706,1.593
4,aback,0.86,343,1.077,2.496


In [3]:
file = pd.read_csv("data/blogs_raw.csv",sep="#")
print(len(file))
file.head()

681288


Unnamed: 0,author,age,gender,date,post
0,0,16,male,19/August/2004,\t DESTINY... you might not say...
1,0,16,male,17/August/2004,\t DEAR ANGEL.. you say it or yo...
2,0,16,male,16/August/2004,\t MAIN AUR MERI TANHAI (jagjeet singh) ...
3,0,16,male,14/August/2004,\t mail addressrs(s) urlLink http://red...
4,0,16,male,09/August/2004,\t RAP- ALLRISE so stand back cause u do...


In [9]:
with open("data/blogs_clean.csv", 'w') as writeFile:
    csv.register_dialect('myDialect', delimiter = '#')
    writer = csv.writer(writeFile, dialect='myDialect')
    row = ["author","age","gender","date","post"]
    writer.writerow(row)
            
writeFile.close()


In [10]:
with open('data/blogs_raw.csv') as read:  
    line = read.readline()
    print(line)
    line = read.readline()
    
    with open("data/blogs_clean.csv", 'a') as writeFile:
        csv.register_dialect('myDialect', delimiter = '#')
        writer = csv.writer(writeFile, dialect='myDialect')
    
        cnt = 1
        while line:
            line = line.strip()
            line = line.split("#")
            post = line[4]
            post = cleanPost(post)
            row = [line[0],line[1],line[2],line[3],post]
            writer.writerow(row)
            
            line = read.readline()
            cnt += 1
            if cnt%25000 == 0: print("Processed {}/681288 blogs".format(cnt))
                
        writeFile.close()
        
    read.close()
    

author#age#gender#date#post

Processed 25000/681288 blogs
Processed 50000/681288 blogs
Processed 75000/681288 blogs
Processed 100000/681288 blogs
Processed 125000/681288 blogs
Processed 150000/681288 blogs
Processed 175000/681288 blogs
Processed 200000/681288 blogs
Processed 225000/681288 blogs
Processed 250000/681288 blogs
Processed 275000/681288 blogs
Processed 300000/681288 blogs
Processed 325000/681288 blogs
Processed 350000/681288 blogs
Processed 375000/681288 blogs
Processed 400000/681288 blogs
Processed 425000/681288 blogs
Processed 450000/681288 blogs
Processed 475000/681288 blogs
Processed 500000/681288 blogs
Processed 525000/681288 blogs
Processed 550000/681288 blogs
Processed 575000/681288 blogs
Processed 600000/681288 blogs
Processed 625000/681288 blogs
Processed 650000/681288 blogs
Processed 675000/681288 blogs


In [6]:
def cleanPost(post):
    
    tokens = word_tokenize(post)
    
    tokens = [word for word in tokens if word.isalpha()] #remove punctuation
    tokens = [w.lower() for w in tokens] #Lower case

    porter = PorterStemmer()
    tokens = [porter.stem(w) for w in tokens] #Stemmed
    post_clean = " ".join(tokens)
    
    return post_clean
post = file['post'][15]
cleanPost(post)

'i make excel lasagna just ask ricotta monkey sorri lisa but that wa your onli contribut after all so much for my great idea to start diet thi weekend posh will kill me i promis to start so we can help each other but consid the fact that the girl is onli look to lose ten that she realli ca afford to i not feel too inclin to encourag her weight loss anyway i keep tell myself that when i get my gazel i will actual stick with it and use it everi day i been lust after one for a year so thi is a realist thought howev know myself i know that i will probabl be for a month and then store it under my bed never to be seen again la vie that me i want to watch tonight i saw it when it came out in theater but due to my bladder i miss a coupl of part of the movi same with harri potter three week ago i need to go see it again becaus of my useless kegel muscl my ball will go to wast of thi i am certain anyway i am wait for lisa to make up her mind as to when she want to start the movi she said earlier

In [8]:
data_clean = pd.read_csv('data/blogs_clean.csv')
print(len(data_clean))
data_clean

33383


Unnamed: 0,author#age#gender#date#post
0,0#16#male#19/August/2004#destini you might not...
1,0#16#male#17/August/2004#dear you say it or yo...
2,0#16#male#16/August/2004#main aur meri tanhai ...
3,0#16#male#14/August/2004#mail addressr s urlli...
4,0#16#male#09/August/2004#allris so stand back ...
5,0#16#male#09/August/2004#miss you badli i am l...
6,0#16#male#07/August/2004#hazel eye close your ...
7,0#16#male#07/August/2004#let it be me a bird h...
8,1#25#female#29/May/2004#it been a long time co...
9,1#25#female#30/June/2004#urllink


# Old code

In [185]:

def convertXML2(name):

    with open ("data/blogs/{}".format(name), "r",encoding = "ISO-8859-1") as myfile:
        data=myfile.read().replace('\n', '')

    obj = xmltodict.parse(data)
    blogs_json = json.dumps(obj)
    blogs_json = json.loads(blogs_json)

    date = blogs_json['Blog']['date']
    post = blogs_json['Blog']['post']

    for i in zip(date,post):
        print(i)
    

In [None]:
name = titles[1]
with open ("data/blogs/{}".format(name), "r") as myfile:
        data=myfile.read().replace('\n', '')
xml = '<root>' + data + '</root>'

obj = xmltodict.parse(xml)
blogs_json = json.dumps(obj)
blogs_json = json.loads(blogs_json)

#convertXML(xml)