In [1]:
import os
import pandas as pd
import urllib.parse
from ua_parser import user_agent_parser
from MojoNewsletterClicksParser import MojoNewsletterClicksParser

# Show the whole content of each column (long urls, hashed emails) instead of
# truncating it. pandas >= 1.0 spells "no limit" as None and newer releases
# reject negative widths; older releases only accept -1, so fall back to it.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Folder that contains the newsletter csv files. The MOJO_CLICK_DATA_DIR
# environment variable overrides the default location so the notebook can be
# run on a machine with a different directory layout.
datafolder = os.environ.get('MOJO_CLICK_DATA_DIR',
                            '/home/centos/mojo/data/click_data_v2/')

# os.listdir returns entries in arbitrary order and also lists hidden entries
# and sub-directories. Sort the names so the row order is reproducible, and
# keep only regular, non-hidden files so read_csv is never handed a directory.
datafiles = [os.path.join(datafolder, name)
             for name in sorted(os.listdir(datafolder))
             if not name.startswith('.')
             and os.path.isfile(os.path.join(datafolder, name))]
if not datafiles:
    raise FileNotFoundError('no click data files found in ' + datafolder)

# Read all the csv files into one data frame. ignore_index=True gives the
# combined frame unique row labels; otherwise every file contributes its own
# 0..n labels and rows from different files share the same label.
# The per-file frames are not kept in a separate list, so they can be freed
# once the combined frame exists.
all_dat = pd.concat([pd.read_csv(path) for path in datafiles],
                    ignore_index=True)

# Change column names to match the names used in the module
# MojoNewsletterClicksParser. rename returns a new frame, so re-running this
# cell always starts from the freshly loaded data.
all_dat = all_dat.rename(columns={'url': 'Url', 'email hashed': 'Email'})

# To time this cell, put `%%time` on the first line of the cell; a bare
# `%time` with no statement after it only times an empty statement.
all_dat.head()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


Unnamed: 0,Url,Email
0,http://www.wnyc.org/shows/heresthething,8a5270412c0e306bbe9e3cef6f12e1ed574e3cabc171dab60a04b6f6e45d127bb5c120a9ccb9de0b1264d9415133bb54f28de5af5bb9cb187e9a11bbf29c1bf9
1,http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same,1544e977bc4fa3937d392baccfb2a9a76acca77295bdb758c76e2fd023a616d375a9f566faa0be88b53bba83b3f16db86dd7491622e88e7018e6f1f1e6d27c8a
2,http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same,6a673f834dee910ffe02c27843a513c2cb988882b553cafeff72a913d52cc1a1fabbbfe3f87368c7b7e5f8642231f23a571950afa8e12263701ef1666c112eb0
3,http://www.motherjones.com/environment/2017/03/wildfires-rip-through-heart-climate-denier-country,d97d32356181dae19d0c3844fc7efa6edd7daa6ca9bad3e5e9a545adb87631981cc8f18e336aa07b04c3b143dade152267b4ed10bd3b441bbaef5382ddc0cb53
4,http://www.motherjones.com/politics/2017/03/exhaustive-history-donald-trump-russia-scandal-timeline,159c6bb7af59369960c8ff9b5449b6f2d846f0aceb6859f28b00b90b777dbbc7488e39221853dbbed3d9fee47b78d83b61a6f95ce122111413532d603bc510f2


In [2]:
# Wrap the combined click data in a MojoNewsletterClicksParser instance.
nl = MojoNewsletterClicksParser(url_df=all_dat)

# Extend the parser's data frame with parsed columns. Only the url and
# domain-type options are switched on; the date and user-agent options
# (by_date, by_ua) are switched off for this run.
url_parse_options = {
    'by_date': False,
    'by_url': True,
    'by_domain_type': True,
    'by_ua': False,
}
full_url_df = nl.extend_url_df(nl.url_df, **url_parse_options)

# Save the extended frame to disk so it can be reloaded later.
full_url_df.to_pickle('full_url_df_v2.pkl')

Add protocol, domain, path, query and fragment
                  to the click data
Add domain_type
                  to the click data


In [3]:
# Reload the extended click data from the pickle file written by the previous
# cell ('full_url_df_v2.pkl').
# NOTE(review): this restores only full_url_df. The cells below also use `nl`,
# which is created in the previous cell, so this cell alone is not enough to
# resume after a kernel restart.
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# this notebook produced itself.
full_url_df = pd.read_pickle('full_url_df_v2.pkl')

In [3]:
# Restrict the click data to standard mojo links and let the parser derive
# topic and title from the url parts. Only the columns listed here are kept.
mojo_standard_cols = ['Email', 'Url', 'domain', 'domain_type', 'topic', 'title']
nl.cleaned_mojo_standard = nl.mojo_standard_parser(
    full_url_df,
    selected_cols=mojo_standard_cols,
)
nl.cleaned_mojo_standard.head()

subset by domain type mojo_standard: selected 7913804 records out of 10030416 records
Add topic, title to the click data
Add utm_campaign, utm_medium, utm_source to the click data


Unnamed: 0,Email,Url,domain,domain_type,topic,title
1,1544e977bc4fa3937d392baccfb2a9a76acca77295bdb758c76e2fd023a616d375a9f566faa0be88b53bba83b3f16db86dd7491622e88e7018e6f1f1e6d27c8a,http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same,www.motherjones.com,mojo_standard,kevin-drum,private-it-turns-out-trump-pretty-much-same
2,6a673f834dee910ffe02c27843a513c2cb988882b553cafeff72a913d52cc1a1fabbbfe3f87368c7b7e5f8642231f23a571950afa8e12263701ef1666c112eb0,http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same,www.motherjones.com,mojo_standard,kevin-drum,private-it-turns-out-trump-pretty-much-same
3,d97d32356181dae19d0c3844fc7efa6edd7daa6ca9bad3e5e9a545adb87631981cc8f18e336aa07b04c3b143dade152267b4ed10bd3b441bbaef5382ddc0cb53,http://www.motherjones.com/environment/2017/03/wildfires-rip-through-heart-climate-denier-country,www.motherjones.com,mojo_standard,environment,wildfires-rip-through-heart-climate-denier-country
4,159c6bb7af59369960c8ff9b5449b6f2d846f0aceb6859f28b00b90b777dbbc7488e39221853dbbed3d9fee47b78d83b61a6f95ce122111413532d603bc510f2,http://www.motherjones.com/politics/2017/03/exhaustive-history-donald-trump-russia-scandal-timeline,www.motherjones.com,mojo_standard,politics,exhaustive-history-donald-trump-russia-scandal-timeline
5,159c6bb7af59369960c8ff9b5449b6f2d846f0aceb6859f28b00b90b777dbbc7488e39221853dbbed3d9fee47b78d83b61a6f95ce122111413532d603bc510f2,http://www.motherjones.com/media/2017/03/breitbart-sleeping-giants-ads,www.motherjones.com,mojo_standard,media,breitbart-sleeping-giants-ads


In [4]:
# Restrict the click data to links from other sources (facebook, twitter, etc.)
# and let the parser derive topic and title from the url parts.
# Only the columns listed here are kept.
other_cols = ['Email', 'Url', 'domain', 'domain_type', 'topic', 'title']
nl.cleaned_other = nl.others_parser(
    full_url_df,
    selected_cols=other_cols,
)
nl.cleaned_other.head()

subset by domain type others: selected 1794310 records out of 10030416 records
Add topic, title, utm_campaign, utm_medium, utm_source to the click data


Unnamed: 0,Email,Url,domain,domain_type,topic,title
0,8a5270412c0e306bbe9e3cef6f12e1ed574e3cabc171dab60a04b6f6e45d127bb5c120a9ccb9de0b1264d9415133bb54f28de5af5bb9cb187e9a11bbf29c1bf9,http://www.wnyc.org/shows/heresthething,www.wnyc.org,others,,
27,3cd3d2b7f495e3f0833e5ecd3f3e5534fb4d481b5f7ac3ec4b9807259324bae9ede678af1a8695d93e29f5ac9ea18c962cfe8d31485d0e2097c7ab8c305c821a,http://twitter.com/intent/tweet?url=http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same&text=The+New+York+Times+Published+a+Report+Showing+That+Trump+Acts+as+Bad+in+Private+as+He+Does+in+Public&via=MotherJones,twitter.com,others,kevin-drum,private-it-turns-out-trump-pretty-much-same
30,3cd3d2b7f495e3f0833e5ecd3f3e5534fb4d481b5f7ac3ec4b9807259324bae9ede678af1a8695d93e29f5ac9ea18c962cfe8d31485d0e2097c7ab8c305c821a,http://www.facebook.com/sharer/sharer.php?u=http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same,www.facebook.com,others,kevin-drum,private-it-turns-out-trump-pretty-much-same
100,913ba214d3781d9e5e5f588cbc6656eaf7b07f383005d47a1d9bb6f5a80358452eeabd38cedc44a16bdf82224b7eb764437cee5a16d6ab0f950cddd6a2f681bb,http://us.macmillan.com/static/smp/tears-we-cannot-stop/?utm_source=motherjones&utm_medium=adbox&utm_term=na-learnmore&utm_content=300x250-learnmore-buynow&utm_campaign=9781250135995,us.macmillan.com,others,,tears-we-cannot-stop
186,339131153c5ab1cea73ff83bf4d4f1f3283f3457bf0bab2503e3b23d80617169bc0630098a34c33f0e9a0ac64eca8ef8843bd7f8c22b821e831e3e82d5df365b,http://www.facebook.com/sharer/sharer.php?u=http://www.motherjones.com/kevin-drum/2017/03/private-it-turns-out-trump-pretty-much-same,www.facebook.com,others,kevin-drum,private-it-turns-out-trump-pretty-much-same


In [5]:
# Restrict the click data to non-standard mojo links (li.motherjones,
# secure.motherjones, etc.). No topic/title columns are requested here,
# only the email hash and the domain information.
mojo_nonstandard_cols = ['Email', 'domain', 'domain_type']
nl.cleaned_mojo_nonstandard = nl.mojo_nonstandard_parser(
    full_url_df,
    selected_cols=mojo_nonstandard_cols,
)
nl.cleaned_mojo_nonstandard.head()

subset by domain type mojo_internal,mojo_image,mojo_secure: selected 307674 records out of 10030416 records


Unnamed: 0,Email,domain,domain_type
48,708aef7cac23a7864ba15c9344112f792aa221cc0e20b8f1248be9047b76843d0483b8125ed1a14bd137fff4f7286978eab245789b7bcede5778397ff303d2f0,li.motherjones.com,mojo_internal
79,f6a1eaf01cf63d6a4a671013617ca3797d78883dd7c27bb918be3d021db30ade18a29d01799c5c4a86416f8dbbf482aae9d50502174d16af4cb045fb2ea8dcf1,li.motherjones.com,mojo_internal
80,f6a1eaf01cf63d6a4a671013617ca3797d78883dd7c27bb918be3d021db30ade18a29d01799c5c4a86416f8dbbf482aae9d50502174d16af4cb045fb2ea8dcf1,li.motherjones.com,mojo_internal
98,b6b69d1e43f6e363519954e11e3b5c39fa3647861a2fb1715ebf87bd5c1a9e932e5e8a7610c1ac03ab4de7613027a1190c0006bd11631709ef5b5da8fe0e332b,li.motherjones.com,mojo_internal
136,a3213a78c4e0794e1c718bc69b03fddb50115c467ae1232f4dbec5305b402c151e3225d30bc85845ab9eb45779f59436bba519cd2f301b65250cb8c6757f6d45,li.motherjones.com,mojo_internal


In [None]:
# Stack the standard-mojo clicks and the other-source clicks into one frame
# (the non-standard mojo subset is not part of it) and save it as a pickle.
frames_to_combine = [nl.cleaned_mojo_standard, nl.cleaned_other]
combined_clean_data = pd.concat(frames_to_combine)
combined_clean_data.to_pickle('newsletter_processed_data_v2.pkl')