In [26]:
import scipy.stats as scs
import matplotlib.pyplot as plt
import pandas as pd
import time
import numpy as np
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import nltk
from datetime import date, datetime, timedelta
from collections import Counter
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')
%matplotlib inline

In [27]:
def get_date(dt):
    '''
    convert datetime string to python datetime.date output
    --------
    PARAMETERS
    dt: str - Date or datetime input (can also be in datetime.datetime format)
    --------
    RETURNS
    dateout: datetime.date - truncated datetime as date
    '''
    dlst = str(dt).strip().split()[0].split('-')
    return date(int(dlst[0]), int(dlst[1]), int(dlst[2]))

def get_count(cell):
    '''
    converts nltk parts of speech tuples into counts of parts of speech
    '''
    cnt_dict = Counter()
    for i in cell:
        cnt_dict[i[:2]] += 1
    return cnt_dict

In [54]:
df = pd.read_csv('data/data_nobadrows.csv')

In [55]:
df.columns

Index([u'Unnamed: 0', u'ID', u'Unnamed: 13', u'backers', u'category',
       u'country', u'currency', u'dataset', u'deadline', u'goal', u'launched',
       u'main_category', u'name', u'pledged', u'state', u'usd pledged',
       u'usd_pledged_real'],
      dtype='object')

In [60]:
df_ = df.sort_values('name')
df_.index = np.arange(df.shape[0])
df_1 = df_[df_.index % 2 == 0].iloc[:-1]
df_2 = df_[df_.index % 2 != 0]

In [64]:
exchange_rates.shape

(724, 10)

In [63]:
for col in df.columns:
    print("{} : {}".format(col,float((df_1[col] == df_2[col]).sum()) / df_1.shape[0]))

Unnamed: 0 : 2.84990410073e-06
ID : 0.453696183123
Unnamed: 13 : 0.0
backers : 0.476213275423
category : 0.502566338643
country : 0.797805003862
currency : 0.811265100929
dataset : 0.281978061438
deadline : 0.000316339355181
goal : 0.473431769021
launched : 0.453696183123
main_category : 0.58200741545
name : 0.452274080977
pledged : 0.463816192585
state : 0.684413019502
usd pledged : 0.401893476285
usd_pledged_real : 0.00439170221922


In [15]:
### Convert currencies

df = pd.read_csv('data/data_nobadrows.csv')
keep_cols = [x for x in df.columns if 'Unnamed' not in x]
df = df[keep_cols]
df.loc[:,'goal'] = df.loc[:,'goal'].astype(int)
df.loc[:,'pledged'] = df.loc[:,'pledged'].astype(int)
df.loc[:,'backers'] = df.loc[:,'backers'].astype(int)

df['launched'] = df['launched'].apply(get_date)
df['deadline'] = df['deadline'].apply(get_date)

get_nasty = df[np.isnan(df['usd pledged'])]
dates = get_nasty['launched'].unique()

currencys = ','.join(get_nasty['currency'].unique())

exchange_rates = pd.read_csv('data/exchange.csv').set_index('date')

get_nasty_ = get_nasty[get_nasty['currency'] != 'DKK']

get_nasty_['usd pledged'] = get_nasty_.T.apply(lambda x: x['pledged'] * \
                 exchange_rates.loc[str(x['launched']),'USD'] / \
                 exchange_rates.loc[str(x['launched']),x['currency']]) 

df.loc[get_nasty_.index, 'usd pledged'] = get_nasty['usd pledged']

df_ = df[~np.isnan(df['usd pledged'])]

df_.to_csv('data/data_with_usd.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [65]:
### add parts of speech tagging
df = pd.read_csv('data/data_with_usd.csv')
names = df['name']
df.loc[:,'name'] = df.loc[:,'name'].fillna('None')
lett_count = names.apply(lambda x: len(re.sub('[^a-z]', '', x.lower())))
word_count = names.apply(lambda x: len(re.findall("[a-z']+", x.lower())))
punc_count = names.apply(lambda x: len(re.findall("[\p{P}\d]", x))) / lett_count
caps_count = names.apply(lambda x: len(re.findall("[A-Z]", x))) / lett_count
words = names.apply(lambda x: re.findall("[a-z']+", x.lower()))
parts_of_speech = words.apply(lambda x: nltk.pos_tag(x))
just_pos = parts_of_speech.apply(lambda x: [i[1] for i in x])
pos_counts = just_pos.apply(get_count)
pos_counts.apply(lambda x: len(x)).max()

# pos_types = Counter()
# for cell in pos_counts:
#     for val in cell.keys():
#         pos_types[val] += cell[val]
        
possesive_count = pos_counts.apply(lambda x: x.get('WP', 0)) / word_count
noun_count = pos_counts.apply(lambda x: x.get('NN', 0)) / word_count
adj_count = pos_counts.apply(lambda x: x.get('JJ', 0)) / word_count
verb_count = pos_counts.apply(lambda x: x.get('VB', 0)) / word_count
preposition_count = pos_counts.apply(lambda x: x.get('IN', 0)) / word_count
determinator_count = pos_counts.apply(lambda x: x.get('DT', 0)) / word_count

df['name$word_count'] = word_count
df['name$punc_count'] = punc_count
df['name$caps_count'] = caps_count
df['name$possesive_count'] = possesive_count
df['name$noun_count'] = noun_count
df['name$adj_count'] = adj_count
df['name$verb_count'] = verb_count
df['name$preposition_count'] = preposition_count
df['name$determinator_count'] = determinator_count

keep_cols = [x for x in df.columns if 'Unnamed' not in x]
df = df[keep_cols]

df.to_csv('data/data_with_pos.csv')

In [66]:
### Add dummy cols
ndf = pd.read_csv('data/data_with_pos.csv')
for cat in df['category'].unique():
    ndf['category&{}'.format(cat)] = df['category'] == cat
for cat in df['main_category'].unique():
    ndf['main_category&{}'.format(cat)] = df['main_category'] == cat
for cat in df['country'].unique():
    ndf['country&{}'.format(cat)] = df['country'] == cat
for cat in df['currency'].unique():
    ndf['currency&{}'.format(cat)] = df['currency'] == cat
for cat in df['country'].unique():
    ndf['country&{}'.format(cat)] = df['country'] == cat    
ndf.drop(['category', 'main_category', 'country', 'currency', 'name', 'country'], axis=1, inplace=True)

ndf['deadline'] = df.loc[:,'deadline'].apply(pd.Timestamp)
ndf['launched'] = df.loc[:,'launched'].apply(pd.Timestamp)

ndf['deadline_month'] = ndf.loc[:,'deadline'].apply(lambda x: x.strftime('%b'))
ndf['launched_month'] = ndf.loc[:,'launched'].apply(lambda x: x.strftime('%b'))

ndf['launched_year'] = ndf.loc[:,'launched'].apply(lambda x: x.strftime('%Y'))

ndf['length'] = (ndf['deadline'] - ndf['launched']).apply(lambda x: x.days)

for cat in ndf['deadline_month'].unique():
    ndf['deadline_month&{}'.format(cat)] = ndf['deadline_month'] == cat
    
for cat in ndf['launched_month'].unique():
    ndf['launched_month&{}'.format(cat)] = ndf['deadline_month'] == cat

ndf.drop(['launched_month',
          'launched_year',
          'deadline_month',
          'launched',
          'deadline'], axis=1, inplace=True)

ndf.to_csv('data/data_with_dummies.csv')

In [67]:
### create binary classification dataset
ndf = pd.read_csv('data/data_with_dummies.csv')
ndf['success'] = (ndf['state'] == 'successful') | (ndf['pledged'] > ndf['goal'])
ndf.drop('state', axis=1, inplace=True)
keep_cols = [x for x in ndf.columns if 'Unnamed' not in x]
ndf = ndf[keep_cols]
ndf.to_csv('data/data_with_dummies_and_class.csv')

In [68]:
keep_cols = [x for x in ndf.columns if 'Unnamed' not in x]
ndf = ndf[keep_cols]
ndf.to_csv('data/data_with_dummies_and_class.csv')

In [69]:
df.sample(5).T

Unnamed: 0,648081,609210,216787,592124,575076
ID,761739897,560287160,375767142,472092348,383455460
backers,5,44,0,61,15
category,Product Design,Dance,Apps,Fashion,Photography
country,US,US,US,CA,US
currency,USD,USD,USD,CAD,USD
deadline,2013-02-08,2013-06-22,2015-10-11,2014-10-05,2012-08-01
goal,9000,2000,80000,27000,2000
launched,2013-01-09,2013-06-01,2015-08-12,2014-09-08,2012-07-02
main_category,Design,Dance,Technology,Fashion,Photography
name,Beach Bum Towels,CONVERSATION PIECES goes to HOLLYWOOD!,DROP OFF POINT,Cobra 3 Twin Crown: 200 Meter Mechanical Autom...,American Panograph Project
