# 13.1 Bitly Data from 1.USA.gov

In [None]:
import pandas as pd
import numpy as np

In [None]:
path = './datasets/bitly_usagov/example.txt'

import json

# Each line of the file is one JSON document. Read explicitly as UTF-8 (the
# standard JSON text encoding) instead of the platform default encoding.
with open(path, encoding='utf-8') as f:
    records = [json.loads(line) for line in f]

In [None]:
records[0]

In [None]:
# Counting Time Zones in Pure Python

In [None]:
# Extract timezones using list comprehension

time_zones = [rec['tz'] for rec in records if 'tz' in rec]

time_zones[:10]

# Counts the timezone 
def get_counts(sequence):
    """Return a dict mapping each item of ``sequence`` to its occurrence count.

    Keys appear in the order in which the items are first seen.
    """
    tally = {}
    for item in sequence:
        # Unseen items start from zero
        tally[item] = tally.get(item, 0) + 1
    return tally

# Using more advanced python standard library
from collections import defaultdict

def get_counts2(sequence):
    """Return a ``defaultdict(int)`` mapping each item to its occurrence count."""
    # Missing keys are initialized to 0 on first access
    tally = defaultdict(int)
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

counts = get_counts(time_zones)

In [None]:
# Return top n counts 
def top_counts(count_dict, n=10):
    """Return the ``n`` largest ``(count, key)`` pairs from ``count_dict``.

    Parameters:
        count_dict: mapping of key -> count.
        n: number of pairs to return (default 10).

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        largest count is last. The list is empty when ``n`` is not positive
        and holds every pair when ``n`` exceeds the number of entries.
    """
    # Guard explicitly: a slice of [-0:] would return the whole list.
    if n <= 0:
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

In [None]:
top_counts(get_counts(time_zones), 5)

In [None]:
# use Python standard library
from collections import Counter

counts = Counter(time_zones)
counts.most_common(10)

## Counting Time Zones with pandas

In [None]:
frame = pd.DataFrame(records)

In [None]:
frame['tz'].head()

In [None]:
tz_counts = frame['tz'].value_counts()

In [None]:
tz_counts.head()

In [None]:
# visualize the data using matplotlib

# Make the plot look nicer by substituting placeholder values for missing and empty time zones

# Records with no 'tz' value at all are labelled 'Missing'
clean_tz = frame['tz'].fillna('Missing')

# Records whose 'tz' is an empty string are labelled 'Unknown'
clean_tz = clean_tz.mask(clean_tz == "", "Unknown")

tz_counts = clean_tz.value_counts()

In [None]:
import seaborn as sns

subset = tz_counts.head()

sns.barplot(y = subset.index, x = subset.to_numpy())

In [None]:
# Split off the first token in the string and make another summary of the user behavior

results = pd.Series([x.split()[0] for x in frame['a'].dropna()])

In [None]:
results.value_counts().head(8)

In [None]:
# Decompose the top time zones into Windows and non-Windows users

# Exclude missing agents from the data
cframe = frame.dropna(subset=['a']).copy()

# Label each row by whether its agent string contains 'Windows'
cframe['os'] = np.where(
    cframe['a'].str.contains('Windows'),
    'Windows',
    'Not Windows',
)

cframe['os'].head()

# Group the data by the time zone column and the new os label
by_tz_os = cframe.groupby(['tz', 'os'])

In [None]:
agg_counts = by_tz_os.size().unstack().fillna(0)

In [None]:
# select the top overall timezones

# Construct an indirect index array from the row counts in agg_counts
# After computing row counts with agg_counts.sum('columns'), use argsort() to obtain an index array that sorts them in ascending order
indexer = agg_counts.sum('columns').argsort()

In [None]:
# use `take` to select the rows in the order. Slice off the last 10 rows (largest values)
count_subset = agg_counts.take(indexer[-10:])

count_subset

# Use n-largest to achieve the same result
agg_counts.sum(axis='columns').nlargest(10)

In [None]:
# To plot in a grouped bar plot

# First reshape to long form (unstack() on a single-level index returns a Series, like stack()) and reset the index to rearrange the data for better compatibility
count_subset = count_subset.unstack()

count_subset.name = 'total'

count_subset = count_subset.reset_index()

# sns.barplot(x = 'total', y = 'tz', hue = 'os', data=count_subset)

# Normalize the group percentage sum to 1

def normal_total(group):
    """Return ``group`` with a ``normed_total`` column added.

    ``normed_total`` is each row's ``total`` divided by the sum of ``total``
    over the whole group, so the values within one group sum to 1.

    Parameters:
        group: DataFrame with a numeric ``total`` column.

    Returns:
        A new DataFrame; the input is left unmodified, because mutating the
        object handed to a ``groupby(...).apply`` callback is not supported
        by pandas.
    """
    return group.assign(normed_total=group['total'] / group['total'].sum())

results = count_subset.groupby('tz').apply(normal_total)

sns.barplot(x = 'normed_total', y='tz', hue='os', data=results)

In [None]:
# Compute the normalized sum more efficiently using the groupby `transform` method

g = count_subset.groupby('tz')
# transform('sum') broadcasts each group's total back onto the original rows,
# so the division is aligned row by row. transform is a method and must be
# called; subscripting it with ['sum'] raises a TypeError.
results2 = count_subset['total'] / g['total'].transform('sum')

In [None]:
count_subset

# 13.2 MovieLens 1M Dataset


In [None]:
# Column names for the three "::"-delimited MovieLens tables
unames = ["user_id", "gender", "age", "occupation", "zip"]
rnames = ["user_id", "movie_id", "rating", "timestamp"]
mnames = ["movie_id", "title", "genres"]

# No header row is read from the files (header=None); the multi-character
# separator is parsed with the Python engine.
users = pd.read_table(
    "datasets/movielens/users.dat",
    sep="::",
    header=None,
    names=unames,
    engine="python",
)

ratings = pd.read_table(
    "datasets/movielens/ratings.dat",
    sep="::",
    header=None,
    names=rnames,
    engine="python",
)

movies = pd.read_table(
    "datasets/movielens/movies.dat",
    sep="::",
    header=None,
    names=mnames,
    engine="python",
)


In [None]:
users.head(5)

In [None]:
ratings.head()

In [None]:
# Merge 'ratings' with 'users' and then merge that result with 'movies' data
data = pd.merge(pd.merge(ratings, users), movies)

In [None]:
data.iloc[0]

In [None]:
# To get mean movie ratings for each film grouped by gender

mean_ratings = data.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
mean_ratings.head()

In [None]:
# Filter down to movies that received at least 250 ratings

# Number of ratings each title received
ratings_by_title = data.groupby('title').size()

# Keep only the titles whose rating count reaches the threshold
active_titles = ratings_by_title[ratings_by_title >= 250].index

In [None]:
active_titles

In [None]:
# Select rows from mean_ratings

mean_ratings = mean_ratings.loc[active_titles]

In [None]:
# To see the top films among female viewers, sort F column in descending order

top_female_ratings = mean_ratings.sort_values('F', ascending=False)

top_female_ratings.head()

## Measure Rating Disagreement
Find the movies that are most divisive between male and female viewers by adding a column to mean_ratings that contains the difference in means.

In [None]:
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']

sorted_by_diff = mean_ratings.sort_values('diff')

In [None]:
sorted_by_diff.head()

In [None]:
# Find the movies preferred by men that women didn't rate as highly
sorted_by_diff[::-1].head()

In [None]:
# get the movies that elicited the most disagreement among viewers

# Disagreement can be measured by the variance or standard deviation of the ratings

# Standard deviation of ratings per title, restricted to the active titles
rating_std_by_title = data.groupby('title')['rating'].std().loc[active_titles]

# Sort in descending order and select the first 10 rows
rating_std_by_title.sort_values(ascending=False).iloc[:10]

In [None]:
# use `explode` method to group genres better
movies['genres'].head()

In [None]:
movies['genres'].head().str.split('|')

In [None]:
# Split the genres string into a list of genres 
movies['genre'] = movies.pop('genres').str.split('|')

# Calling `explode` method to generate a new DataFrame with one row for each 'inner' element
movies_exploded = movies.explode('genre')
movies_exploded[:10]

In [None]:
# Merge all three tables together and group by genre

rating_with_genre = pd.merge(pd.merge(movies_exploded, ratings),users)

In [None]:
genre_ratings = (rating_with_genre.groupby(['genre', 'age'])['rating'].mean().unstack('age'))
genre_ratings