Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
449 lines (335 sloc) 13.9 KB
# -*- encoding: utf-8
from __future__ import division
"""
From among the links published by Facebook users, pick the links
that differentiate users the most (by age, gender, location etc.).
The input data is a sparse matrix. The first few columns hold
factor data (such as age, gender etc.), and the remaining columns
hold features (URLs). Each row represents one user.
"""
import sys
import gzip
import msgpack
import os.path
from itertools import combinations
from textwrap import wrap
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.sparse as sparse
import matplotlib.pyplot as plt
import brewer2mpl
from pylab import get_cmap
from utils.utils import Utils
from lib.examine_sparse_db import ExamineSparseDB
# Chart labels (Polish) for the settlement-size buckets, keyed by the numeric
# code used in the data ("tys." = thousand, "Wieś" = village).
# Code 0 ("b/d") is intentionally left without a label.
location_codes = dict([
    (1, u'500 tys.+'),
    (2, u'400-500 tys.'),
    (3, u'300-400 tys.'),
    (4, u'200-300 tys.'),
    (5, u'100-200 tys.'),
    (6, u'< 100 tys.'),
    (7, u'Wieś'),
])

# Chart labels (Polish) for relationship statuses, listed in ascending order
# of their numeric code.  Codes without a label here:
#   2  - none
#   7  - in a domestic partnership
#   8  - divorced
#   9  - widowed
#   10 - in a civil union
#   11 - separated
rs_codes = [
    label
    for _code, label in sorted([
        (0, u'Zaręczony/-a'),
        (1, u'To skomplikowane'),
        (3, u'Wolny/-a'),
        (4, u'W związku'),
        (5, u'Małżeństwo'),
        (6, u'Otwarty związek'),
    ])
]
class ProcessMostDistinctive(ExamineSparseDB):
    """
    Pick and chart the features (URLs) that differ most between the
    categories of a chosen factor (age, gender, location size, ...).

    Typical use: call one of the ``do_*`` helpers, or call ``groupby()``
    followed by ``make_chart()``.

    ``self.col_names`` is read by this class but never assigned here, and
    ``self.data`` / ``self.fac_len`` only get placeholder values in
    ``__init__``; they are presumably populated by ``self.init()`` coming
    from the parent class -- TODO confirm.
    """

    def __init__(self,
                 matrix_fn=None,
                 colnames_fn=None,
                 factors_fn=None,
                 verbose=1):
        """
        Remember the input file names, reset the state and call
        ``self.init()``.

        Args:
            matrix_fn (str): *.mtx file containing sparse data matrix
            colnames_fn (str): column names for columns in 'matrix_fn'
            factors_fn (str): presumably a file listing the names of the
                              columns containing factors (such as age);
                              the remaining columns are considered
                              features -- verify against the parent class.
            verbose (int): 0 - shut up; 1 - explain.

        NOTE(review): the parent ``__init__`` is not called here; this is
        kept as in the original.
        """
        self._matrix_fn = matrix_fn
        self._colnames_fn = colnames_fn
        self._factors_fn = factors_fn
        self.factors = None
        self.fac_len = 0
        self.verbose = verbose
        self.data = None
        self.df = None
        self.categorized = None
        self.init()

    def _reset(self):
        """Forget the binning produced by a previous categorical groupby()."""
        self.categorized = None

    def _gen_colors(self, num):
        """
        Return a list of 'num' matplotlib colors.

        Up to 12 colors come from the ColorBrewer 'Set3' palette, larger
        requests are sampled from the 'Dark2' colormap.
        """
        if num < 13:
            # ColorBrewer palettes are only defined for 3 or more colors, so
            # ask for at least 3 and cut the list down to the requested size.
            palette = brewer2mpl.get_map('Set3', 'Qualitative', max(num, 3))
            return palette.mpl_colors[:num]
        cm = get_cmap('Dark2')
        return [cm(1.*i/num) for i in range(num)]

    def _print(self, astr):
        """Print 'astr' unless the object was created with verbose=0."""
        if self.verbose > 0:
            # Parenthesised single-argument form behaves identically as a
            # Python 2 print statement and as the Python 3 print function.
            print(astr)

    @staticmethod
    def _category_codes(cat):
        """Integer bin codes of a pandas Categorical (old or new pandas)."""
        codes = getattr(cat, 'codes', None)
        if codes is None:
            # Older pandas releases exposed the codes as 'labels'.
            codes = cat.labels
        return codes

    @staticmethod
    def _category_levels(cat):
        """Bin descriptions of a pandas Categorical (old or new pandas)."""
        levels = getattr(cat, 'categories', None)
        if levels is None:
            # Older pandas releases exposed the categories as 'levels'.
            levels = cat.levels
        return list(levels)

    @staticmethod
    def _to_text(value):
        """Return 'value' as text; byte strings are decoded as UTF-8."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        if isinstance(value, type(u'')):
            return value
        return str(value)

    @staticmethod
    def _location_labels():
        """Labels from 'location_codes' in ascending order of their code."""
        return [location_codes[code] for code in sorted(location_codes)]

    def groupby(self, factor,
                min_cum_val=10,
                drop_factor_vals=None,
                categorical=0,
                drop_features=None,
                drop_rows=None):
        """
        Group by factor 'factor': sum the feature columns of all rows that
        share the same value of the factor.

        Args:
            factor (str) - a factor by which the samples will be grouped.
            min_cum_val (int) - intended to drop features whose cumulative
                         value is smaller than 'min_cum_val' for every
                         category of the factor.
                         NOTE: the argument is accepted for compatibility
                         but is NOT applied by this implementation.
            drop_factor_vals (tuple) - drop particular values of the factor
                         (such as 0 values for factor 'age' indicating no
                         age for particular user). The check is made on the
                         integer value of the factor, before any binning.
            categorical (int) - discretize continuous values of a factor.
                         The value of 'categorical' is a number of bins.
            drop_features (tuple) - a tuple of feature names which should be
                         dropped from analysis
            drop_rows (tuple of tuples) - drop rows by values of a particular
                         factor. Factor 'is_full_feed' takes two values
                         (0 and 1). To drop rows with zeros in this factor
                         the drop_rows should be: (('is_full_feed', 0),).
        Sets:
            self.df to pandas's DataFrame object. Its first column holds the
            factor value (or bin code), the second one ('users_count') the
            number of rows in the group, the remaining ones the summed
            features. Rows are ordered by ascending factor value.
            self.categorized to the result of pandas.cut() when
            'categorical' is used, to None otherwise.
        Returns:
            returns pandas's DataFrame object.
        Raises:
            AssertionError - 'factor' is not a known column.
            ValueError - a factor named in 'drop_rows' is not a known
                         column, or no row is left after filtering.
        """
        assert factor in self.col_names, "Factor %s does not exist!" % (factor)
        # Do not let a binning from an earlier call leak into the next chart.
        self._reset()

        fac_len = self.fac_len
        fac_ind = self.col_names.index(factor)
        m_csr = self.data.tocsr()
        rows_len = self.data.shape[0]

        bin_codes = None
        if categorical:
            f_col = m_csr.getcol(fac_ind).toarray().ravel()
            self.categorized = pd.cut(f_col, categorical)
            bin_codes = self._category_codes(self.categorized)

        # Resolve the column positions once instead of once per row.
        drop_checks = [(self.col_names.index(drop_factor), val_to_drop)
                       for drop_factor, val_to_drop in (drop_rows or ())]

        results = {}
        target_stats = {}
        for row_ind in range(rows_len):
            arow = m_csr.getrow(row_ind).toarray().ravel()
            if any(arow[ind] == val for ind, val in drop_checks):
                continue
            factor_val = int(arow[fac_ind])
            if drop_factor_vals and factor_val in drop_factor_vals:
                continue
            if bin_codes is not None:
                factor_val = int(bin_codes[row_ind])
            features = arow[fac_len:]
            if factor_val in results:
                results[factor_val] += features  # element-wise addition
                target_stats[factor_val] += 1
            else:
                # 'arow' is a fresh array for every row, so it is safe to
                # keep this slice and add to it in place later on.
                results[factor_val] = features
                target_stats[factor_val] = 1

        if not results:
            raise ValueError("No rows left to group by factor %s!" % (factor))

        # One row per factor value. The explicit sort keeps the rows in
        # ascending order of the factor value regardless of dict ordering;
        # the tick labels passed to make_chart() rely on that order.
        df = pd.DataFrame(results).T.sort_index()
        # URLs as column names
        df.columns = self.col_names[fac_len:]
        df.insert(0, self.col_names[fac_ind], df.index.tolist())
        df.insert(1, 'users_count',
                  [target_stats[i] for i in df.index.tolist()])
        # index from 1 to max num of rows
        df.index = range(1, df.shape[0]+1)
        if drop_features:
            df = df.drop(list(drop_features), axis=1)
        self._print(df.iloc[:, :2])
        self.df = df
        return df

    def _prepare_chart_data(self):
        """
        Select the (at most) nine features whose share differs most between
        the categories of the grouped factor.

        Returns:
            list of (feature_name, rows) tuples ordered by ascending standard
            deviation, where 'rows' is a list of
            (category, percentage, users_count) tuples, one per category.
        """
        assert self.df is not None, "No data in 'df'. Run 'groupby()' first!"
        # count percentages for features; the two leading columns (factor
        # value and users_count) are kept unchanged
        heads = self.df.iloc[:, :2]
        shares = self.df.iloc[:, 2:].div(self.df.loc[:, 'users_count'],
                                         axis=0)
        # count dispersion for every feature as standard deviation
        spread = shares.std(axis=0)
        # ascending order, so the most dispersed features come last
        order = np.argsort(spread.values)
        top_features = [spread.index[i] for i in order[-9:]]

        self._print('Factor: {}'.format(heads.columns[0]))
        # categories of a factor
        categories = heads.iloc[:, 0]
        # counter of rows ascribed to a given category
        ucount = list(self.df.loc[:, 'users_count'].values)
        toplot = []
        for feature in top_features:
            # percentage values for the feature
            pct = [i*100 for i in shares.loc[:, feature].values]
            # A real list: the rows are read here and again while plotting.
            rows = list(zip(categories, pct, ucount))
            toplot.append((feature, rows))
            # print some stats
            self._print(' feature: {}'.format(feature))
            for cat, val, u_count in rows:
                self._print(' category: {}; value: {}; N={}'.format(cat,
                                                                    val,
                                                                    u_count))
            self._print('-'*80)
        return toplot

    def _autolabel(self, ax, rects, texts):
        """
        Write the percentage from 'texts' above each bar in 'rects'.

        Currently not called by make_chart().
        """
        ylim = ax.get_ylim()[1]
        for ii, rect in enumerate(rects):
            height = rect.get_height()
            ax.text(rect.get_x()+rect.get_width()/2.,
                    height + 0.05*ylim,
                    '%.2f%%' % (texts[ii]),
                    rotation='vertical',
                    color='black',
                    fontsize=7,
                    ha='center',
                    va='bottom')

    def make_chart(self,
                   xlabel=None,
                   xtick_labels=None,
                   xlabel_fontsize=7,
                   savefn=None,
                   xticks_rotation=0,
                   xticks_fontsize=6,
                   custom_colors=None,
                   bar_align='edge',
                   adjust_bottom=0.18,
                   suptitle_fontsize=13,
                   title=''):
        """
        Draw a 3x3 grid of bar charts, one per most distinctive feature,
        with one bar per category of the factor used in groupby().

        Args:
            xlabel (unicode) - label of the x axis (bottom row only).
            xtick_labels (sequence) - one label per category, in ascending
                         order of the factor value. Defaults to the bin
                         descriptions of a categorical groupby(), otherwise
                         to the category positions.
            xlabel_fontsize (int) - font size of 'xlabel'.
            savefn (str) - file name; the chart is saved into the 'results'
                         directory. When None the chart is shown instead.
            xticks_rotation (int) - rotation of the x tick labels.
            xticks_fontsize (int) - font size of the x tick labels.
            custom_colors (sequence) - bar colors; generated when omitted.
            bar_align (str) - 'align' argument of matplotlib's bar().
            adjust_bottom (float) - bottom margin of the figure.
            suptitle_fontsize (int) - font size of 'title'.
            title (unicode) - title of the whole figure.
        """
        toplot = self._prepare_chart_data()
        fig, ax = plt.subplots(3, 3, sharey=True, sharex=True, figsize=(6, 6))
        fig.subplots_adjust(top=0.89, bottom=adjust_bottom, hspace=0.25)
        fig.suptitle(title, fontsize=suptitle_fontsize, weight='bold')
        axes = np.ravel(ax)
        categories = list(range(self.df.shape[0]))
        colors = custom_colors if custom_colors\
            else self._gen_colors(len(categories))

        if xtick_labels is not None:
            xtick_labels = list(xtick_labels)
        elif self.categorized is not None:
            # Compare with None explicitly: the truth value of a pandas
            # object is ambiguous.
            xtick_labels = [str(level) for level
                            in self._category_levels(self.categorized)]
        else:
            xtick_labels = [str(i) for i in categories]

        # Most distinctive feature goes into the first sub-plot.
        for sp_num, (feature, rows) in enumerate(reversed(toplot)):
            values = [row[1] for row in rows]
            sub = axes[sp_num]
            sub.set_title(self._to_text(feature)[:25], fontsize=10)
            sub.grid()
            sub.bar(categories,
                    values,
                    alpha=0.99,
                    color=colors,
                    align=bar_align)
            sub.set_xticks(list(range(len(xtick_labels))))
            sub.set_xticklabels(xtick_labels,
                                fontsize=xticks_fontsize,
                                rotation=xticks_rotation,
                                ha='center')
            sub.tick_params(axis='y', labelsize=6)
            if sp_num in (0, 3, 6):
                sub.set_ylabel(u'% udostępnień', fontsize=7)
            if sp_num > 5 and xlabel is not None:
                sub.set_xlabel(xlabel, fontsize=xlabel_fontsize)

        # Footer with the number of users per category; the counts are the
        # same for every feature, so the first one is used.
        first_rows = toplot[0][1] if toplot else []
        ssizes = ' '.join(['N("%s")=%s; ' % (label, row[2])
                           for label, row in zip(xtick_labels, first_rows)])
        ssizes = '\n'.join(wrap(ssizes, 100))
        fig.text(0.12, 0.02, ssizes,
                 backgroundcolor='white', color='black', weight='roman',
                 size=6)

        if savefn is None:
            plt.show()
        else:
            try:
                fig.savefig(os.path.join('results', savefn))
            finally:
                # Saved figures are not needed any more; closing them keeps
                # memory flat when many charts are produced in one run.
                plt.close(fig)
        self._reset()

    def do_relationship_status(self, title, savefn, drop_features=None):
        """Chart the most distinctive features by relationship status."""
        self.groupby('relationship_status',
                     drop_factor_vals=(2, 7, 8, 9, 10, 11),
                     drop_features=drop_features)
        self.make_chart(xlabel=u'Status związku',
                        xtick_labels=rs_codes,
                        xticks_rotation=50,
                        xticks_fontsize=6,
                        adjust_bottom=0.21,
                        title=title,
                        savefn=savefn)

    def do_location_size(self, title, savefn, drop_features=None):
        """Chart the most distinctive features by size of current location."""
        self.groupby('location_size',
                     drop_factor_vals=(0, ),
                     drop_features=drop_features)
        self.make_chart(xlabel=u'Wielkość lokalizacji',
                        xtick_labels=self._location_labels(),
                        xticks_rotation=45,
                        suptitle_fontsize=10,
                        adjust_bottom=0.19,
                        xticks_fontsize=6,
                        title=title,
                        savefn=savefn)

    def do_hometown_size(self, title, savefn, drop_features=None):
        """Chart the most distinctive features by size of the hometown."""
        self.groupby('hometown_size',
                     drop_factor_vals=(0, ),
                     drop_features=drop_features)
        self.make_chart(xlabel=u'Wielkość miejsca pochodzenia',
                        xtick_labels=self._location_labels(),
                        xticks_rotation=45,
                        xlabel_fontsize=7,
                        adjust_bottom=0.19,
                        xticks_fontsize=6,
                        suptitle_fontsize=10,
                        title=title,
                        savefn=savefn)

    def do_gender(self, title, savefn, drop_features=None):
        """Chart the most distinctive features by gender ('is_male')."""
        self.groupby('is_male', drop_features=drop_features)
        self.make_chart(xlabel=u'Płeć',
                        xtick_labels=(u'Kobieta', u'Mężczyzna'),
                        custom_colors=('pink', '#6495ED'),
                        xticks_fontsize=6,
                        bar_align='center',
                        adjust_bottom=0.1,
                        title=title,
                        savefn=savefn)

    def do_age(self, title, savefn, drop_features=None):
        """Chart the most distinctive features by age range."""
        self.groupby('age_range',
                     drop_factor_vals=(0, ),
                     drop_features=drop_features)
        self.make_chart(xlabel=u'Wiek',
                        xtick_labels=('13-18', '19-24', '25-30', '31-36',
                                      '37-42', '43-48', '49-54', '55-60',
                                      '61-66'),
                        xticks_rotation=45,
                        xticks_fontsize=6,
                        adjust_bottom=0.14,
                        title=title,
                        savefn=savefn)

    def do_number_of_friends(self, title, savefn, drop_features=None):
        """
        Chart the most distinctive features by number of friends.

        Rows with 'is_full_feed' equal to 0 are left out.
        """
        self.groupby('number_of_friends_range',
                     drop_factor_vals=(0, 1),
                     drop_features=drop_features,
                     drop_rows=(('is_full_feed', 0), ))
        self.make_chart(xlabel=u'Liczba znajomych',
                        xtick_labels=('51-100', '101-150', '151-200',
                                      '201-250', '251-300', '301-350',
                                      '351-400', '401-450', '451-500',
                                      '501+'),
                        xticks_rotation=45,
                        xticks_fontsize=6,
                        adjust_bottom=0.15,
                        title=title,
                        savefn=savefn)