In [44]:
import pandas as pd
import numpy as np
from datetime import datetime

In [45]:
# Load the raw Kickstarter projects table from a path relative to the notebook.
df = pd.read_csv('data/ks-projects-201801.csv')

In [46]:
# Drop every row whose 'name' field is missing (NaN after read_csv).
# A vectorised isna() mask replaces the former row-by-row iterrows() scan,
# which is very slow on a frame with hundreds of thousands of rows.
index_to_remove = df.index[df['name'].isna()].tolist()

df = df.drop(index_to_remove)
print("Removed %d rows with no 'name' field" %len(index_to_remove))

Removed 4 rows with no 'name' field


In [47]:
# Discard campaigns that are neither 'failed' nor 'successful', then build the
# binary label list in row order (failed -> 0, every remaining state -> 1).
drop_states = ['live', 'canceled', 'suspended', 'undefined']
unwanted_rows = df.index[df.state.isin(drop_states)]
df.drop(unwanted_rows, inplace=True)

results = [0 if state == 'failed' else 1 for state in df['state']]

In [48]:
# For every project: campaign length in whole calendar days, and the launch
# moment as integer epoch seconds (naive datetime, so local-time interpretation).
launched, duration = [], []
for launch_str, deadline_str in zip(df['launched'], df['deadline']):
    launch_dt = datetime.strptime(launch_str, '%Y-%m-%d %H:%M:%S')
    deadline_day = datetime.strptime(deadline_str, '%Y-%m-%d').date()
    duration.append((deadline_day - launch_dt.date()).days)
    launched.append(int(round(launch_dt.timestamp())))

In [49]:
# Discard the columns that are not kept as features
drop_cols = [
    'ID', 'deadline', 'goal', 'launched',
    'pledged', 'state', 'usd pledged', 'usd_pledged_real',
]
df.drop(columns=drop_cols, inplace=True)

In [50]:
# Attach the derived lists as columns. Both lists were built by iterating df after
# the row filtering above, and list assignment is positional, so they line up with
# the rows only as long as df's rows have not changed since.
df['duration'] = duration
df['results'] = results

In [51]:
def _target_encode(frame, column, target='results'):
    """Return the per-level mean of `target` and `column` re-expressed as that mean.

    Parameters:
        frame: DataFrame holding both `column` and `target`.
        column: name of the categorical column to encode.
        target: name of the numeric column to average (default 'results').

    Returns:
        (rates, encoded): `rates` is a Series indexed by the distinct values of
        `column`; `encoded` is `frame[column]` with every value replaced by its rate.
    """
    rates = frame.groupby(column)[target].mean()
    # Series.map does the lookup in one vectorised pass instead of a Python-level
    # lambda call per row.
    return rates, frame[column].map(rates)


# Replace each categorical column by the success rate of its level.
# NOTE(review): the rates are computed on the full frame, before the train/test
# split performed further down, so test-set labels influence the encoding.
main_category_success, df['main_category'] = _target_encode(df, 'main_category')
category_success, df['category'] = _target_encode(df, 'category')
currency_success, df['currency'] = _target_encode(df, 'currency')
country_success, df['country'] = _target_encode(df, 'country')

In [None]:
# Add some shallow linguistic features as metadata

In [None]:
"""
<preproc>
- remove the (Canceled) and (Suspended) markers
- remove quotation marks that wrap the whole title (at the beginning and at the end)
- collapse elongated (repeated) quotation marks into a single one

- Num of words
- Num of named entities
- Use of quotation
- Use of elongation
- Num of all capital words
- Num of special characters (:, !, ?, #, etc.)

- Glove word embedding + average
- TF-IDF embedding?
"""

In [106]:
# Quick check of stringProc on a title that still carries CSV-style doubled quotes.
# NOTE: stringProc is defined in the cell that follows this one, so on a fresh
# kernel that cell has to be run first.
s = '"Air Bonsai | Create your ""little star"""'
ss = stringProc(s)
ss

'Air Bonsai | Create your "little star"'

In [59]:
def stringProc(s):
    """Clean a raw project title.

    Steps:
      1. remove the "(Canceled)" / "(Suspended)" status markers;
      2. trim surrounding whitespace (done after step 1 so that a title such as
         "Foo (Canceled)" does not keep a trailing space, and so that a quote
         followed by a marker is still recognised as the closing quote);
      3. repeatedly peel off a pair of double quotes wrapping the whole title;
      4. collapse every run of two or more double quotes into a single one.

    Parameters:
        s: the title as a str.

    Returns:
        The cleaned title as a str.
    """
    import re

    s = s.replace("(Canceled)", '').replace("(Suspended)", '').strip()
    # NOTE(review): a title that merely starts and ends with unrelated quotes
    # (e.g. '"A" and "B"') is unwrapped as well; kept as in the original logic.
    while s.startswith('"') and s.endswith('"'):
        s = s[1:-1].strip()
    # A regex handles quote runs of any length, unlike a fixed chain of replaces.
    s = re.sub(r'"{2,}', '"', s)
    return s

In [37]:
from tqdm import tqdm

In [53]:
import spacy
# NOTE(review): the "en" shortcut name is a spaCy 2.x convention; spaCy 3 requires the
# full package name (e.g. "en_core_web_sm") — confirm which version this notebook targets.
nlp = spacy.load("en")

def stringEncoding(s):
    """Extract shallow surface features from a (cleaned) project title.

    The title is tokenised with the module-level spaCy pipeline `nlp`.

    Parameters:
        s: the title as a str.

    Returns:
        A 5-tuple (numWords, numEntities, numAllCap, numSpecialChar, useOfQuot):
          numWords       - number of tokens in the parsed title
          numEntities    - number of entries in `doc.ents`
          numAllCap      - tokens whose cased characters are all upper case
          numSpecialChar - tokens made up solely of special characters
          useOfQuot      - 1 if the title contains a double quote, else 0
    """
    # Raw string: keeps the backslash as a literal character without relying on
    # an invalid escape sequence. A set gives per-character membership instead
    # of an accidental substring test.
    special_chars = set(r"`~!@#$%^&*()-_=+{}[]:;<>?/'\|,.")

    doc = nlp(s)

    numWords = len(doc)
    numEntities = len(doc.ents)

    numAllCap, numSpecialChar = 0, 0
    for tok in doc:
        text = tok.text
        # isupper() needs at least one cased character, so digits and
        # punctuation are no longer counted as "all capital" words.
        if text.isupper():
            numAllCap += 1
        if text and all(ch in special_chars for ch in text):
            numSpecialChar += 1

    # Look at the title itself; membership in doc.vocab says nothing about
    # whether this particular title contains a quote.
    useOfQuot = 1 if '"' in s else 0

    return numWords, numEntities, numAllCap, numSpecialChar, useOfQuot

In [60]:
# Cleaned version of every title, in row order
names = [stringProc(raw_title) for raw_title in df['name']]

In [62]:
# Run the spaCy-based feature extraction over every cleaned title and collect each
# feature in its own list (one entry per title, in the order of `names`).
# This is slow: the recorded run took about 50 minutes for 331,672 titles.
numWords, numEntities, numAllCap, numSpecialChar, useOfQuot = [], [], [], [], []
for elem in tqdm(names):
    nw, ne, na, ns, uq = stringEncoding(elem)
    numWords.append(nw)
    numEntities.append(ne)
    numAllCap.append(na)
    numSpecialChar.append(ns)
    useOfQuot.append(uq)

100%|██████████| 331672/331672 [50:19<00:00, 109.84it/s] 


In [96]:
def metadata_clamp(results, threshold):
    """Cap every value of `results` at `threshold`.

    Values greater than or equal to `threshold` are replaced by `threshold`
    itself; all other values are passed through unchanged. Returns a new list.
    """
    return [threshold if value >= threshold else value for value in results]
            
# Cap the three count features at fixed ceilings.
# NOTE(review): the ceilings 25 / 23 / 13 are hard-coded with no derivation shown —
# presumably picked from the observed distributions; confirm.
numWords_trunc = metadata_clamp(numWords, 25)
numAllCap_trunc = metadata_clamp(numAllCap, 23)
numSpecialChar_trunc = metadata_clamp(numSpecialChar, 13)

In [97]:
# Add the title-derived features as columns and overwrite 'name' with the cleaned
# titles. The lists are assigned positionally, so they must still be in df's row order.
df['num_words'] = numWords_trunc
df['num_entities'] = numEntities
df['num_allcaps'] = numAllCap_trunc
df['num_special'] = numSpecialChar_trunc
df['use_quot'] = useOfQuot
df['name'] = names

In [64]:
# Final column order: title, encoded categoricals, numeric features, title features, label
columns_titles = [
    'name',
    'category', 'main_category', 'currency', 'country',
    'duration', 'backers', 'usd_goal_real',
    'num_words', 'num_entities', 'num_allcaps', 'num_special', 'use_quot',
    'results',
]
df = df.reindex(columns=columns_titles)

In [104]:
# Hold out a reproducible (seeded) 20% random sample as the test set;
# every row not sampled forms the training set.
df_test = df.sample(random_state=47, frac=0.2)
df_train = df.drop(df_test.index)

In [109]:
df_test['name'].tolist()[:100]

['Origami Star Flower',
 'Finish The Tales Of Extraordinary Beings Series',
 'Petitjean Paris | A silk ready-to-wear line',
 'Newsome Awards',
 'Make "Ever With You" possible to be released this year!!!',
 'Sleeping Bag Sheet',
 'Whale Stapler',
 'Resurrecting Rodney',
 'National Park Postcard Adventure',
 'ALEX LONDON NYFW 2012 SPRING SUMMER NEEDS YOUR HELP!',
 'The Holy Land Project',
 'NINJA HIGH SCHOOL Expandable Card Game',
 'Steppin with style tv show',
 '360° India',
 "Meet SYRE, The World's First Bluetooth iPod Nano Watch Case",
 '"Those Behind" a Horror Novel By Tim Scalita',
 'Emmerst - Self Titled Deathcore / Nu Metal EP',
 'The Barefoot Movement "Figures of the Year"',
 "PEANUTBUTTER & JELLY PLAYERS Children's Theater",
 'Custom Art Card Sleeves, for Magic the Gathering, etc.',
 'Copse GP Greenpower Team',
 'Legends of the Boo-Monster',
 'EXP puppet troupe is currently filming "King Daddy Sunshine"',
 'I want to build a Trebuchet and have fun.',
 'Ovulation (a drama / horro

In [110]:
# Persist both splits as comma-separated files, without the index column.
df_test.to_csv('testSet.csv', sep=',', index=False)
df_train.to_csv('trainingSet.csv', sep=',', index=False)

In [None]:
"""
Attributes to use: 
    - name
    - category
    - main_category
    - backers
    - duration (deadline - launched)
    - country
    - usd_goal_real

Objective:
    - regress usd_pledged_real OR percentage (usd_pledged_real / usd_goal_real * 100)
    - predict state (successful or not)
"""

In [None]:
# Preprocess category, main_category, country
def attr_encoding(df, attr_name):
    """Integer-encode one column by the rank of each value among the sorted distinct values.

    Parameters:
        df: mapping-like container (e.g. a DataFrame) such that `df[attr_name]`
            is an iterable of sortable, hashable values.
        attr_name: key / column name to encode.

    Returns:
        A list with one int per element of `df[attr_name]`; the smallest
        distinct value maps to 0, the next one to 1, and so on.
    """
    column = df[attr_name]
    # Build the value -> code table once; the former list.index() call per row
    # rescanned the list of distinct values for every element.
    code_of = {value: code for code, value in enumerate(sorted(set(column)))}
    return [code_of[elem] for elem in column]

# Integer-encode the three categorical columns.
# NOTE(review): df_copy is not defined in any cell visible in this notebook —
# presumably an untouched copy of the raw frame; confirm before running.
category = attr_encoding(df_copy, 'category')
main_category = attr_encoding(df_copy, 'main_category')
country = attr_encoding(df_copy, 'country')

In [None]:
# Bar chart of the number of projects per country, most frequent country first.
# NOTE(review): `plt` is never imported in the visible cells, and `Counter` is only
# imported in a later cell, so this cell fails on a fresh top-to-bottom run.
cs = sorted(Counter(df_copy['country']).items(), key=lambda x:x[1], reverse=True)
x = [key for key, val in cs]
label = [val for key, val in cs]
plt.bar(x, label)
plt.show()

In [None]:
"""
num_backers: skewed distribution -> range of log scale count
0: 55609
1, 2: 58065
3 ~ 7: 52998
8 ~ 20: 53692
21 ~ 47: 53213
48 ~ 116: 52449
117 ~ : 52635
"""
def encodeBackers(num_backers):
    """Map a backer count to its bucket index.

    Buckets: 0 -> 0, 1..2 -> 1, 3..7 -> 2, 8..20 -> 3, 21..47 -> 4,
    48..116 -> 5; any value that belongs to none of them -> 6.
    """
    buckets = ([0], [1, 2], range(3, 8), range(8, 21), range(21, 48), range(48, 117))
    for code, members in enumerate(buckets):
        if num_backers in members:
            return code
    # Not contained in any bucket above
    return len(buckets)

# Bucket index of the backer count for every project, in row order
backers = [encodeBackers(count) for count in df_copy['backers']]

In [None]:
from datetime import datetime

# duration_raw: campaign length in calendar days.
# duration: binary flag, 0 for campaigns of at most 30 days and 1 for longer ones.
duration = []
duration_raw = []
for launch_str, deadline_str in zip(df_copy['launched'], df_copy['deadline']):
    launch_day = datetime.strptime(launch_str, '%Y-%m-%d %H:%M:%S').date()
    deadline_day = datetime.strptime(deadline_str, '%Y-%m-%d').date()
    span_days = (deadline_day - launch_day).days
    duration.append(1 if span_days > 30 else 0)
    duration_raw.append(span_days)

In [None]:
max(df_copy['usd_goal_real'])

In [None]:
from collections import Counter

In [None]:
def returnGoalEncodeDict(df, min_count=5000, eps=1e-6):
    """Derive goal-amount buckets that each hold at least `min_count` projects.

    The distinct values of `df['usd_goal_real']` are walked in ascending order
    while their occurrence counts are accumulated. Whenever the running count
    reaches `min_count` at a whole-number amount, that amount closes the
    current bucket and the count restarts from zero.

    Parameters:
        df: DataFrame with a numeric 'usd_goal_real' column.
        min_count: minimum number of rows per bucket (default 5000).
        eps: small offset added to every upper bound so that the closing
            amount itself falls inside the bucket it closes (default 1e-6).

    Returns:
        dict mapping bucket index -> (lower, upper); the first lower bound is 0
        and every later lower bound equals the previous upper bound.
    """
    goal_counts = sorted(
        Counter(df['usd_goal_real'].values.tolist()).items(),
        key=lambda item: item[0],
    )

    running, range_threshold = 0, []
    for goal, occurrences in goal_counts:
        running += occurrences
        # float.is_integer() is simply False for NaN/inf, whereas comparing
        # against int(goal) would raise on such values.
        if running >= min_count and float(goal).is_integer():
            range_threshold.append(int(goal))
            running = 0

    encode_dict = {}
    lower = 0
    for code, threshold in enumerate(range_threshold):
        upper = threshold + eps
        encode_dict[code] = (lower, upper)
        lower = upper

    return encode_dict

def encodeGoal(goal_amount, encode_dict):
    """Return the index of the bucket whose half-open range [low, high) holds `goal_amount`.

    `encode_dict` maps bucket index -> (low, high). When several ranges match,
    the one listed last wins. When none matches, the number of buckets is
    returned, i.e. one extra overflow bucket.
    """
    hits = [
        code
        for code, bounds in encode_dict.items()
        if goal_amount >= bounds[0] and goal_amount < bounds[1]
    ]
    encoded_goal = hits[-1] if hits else -1
    return len(encode_dict.keys()) if encoded_goal == -1 else encoded_goal

# Bucket boundaries are derived from the goal amounts only ...
encode_dict = returnGoalEncodeDict(df_copy)

# ... and the same boundaries are then applied to both the goal and the pledged amounts.
goals = [encodeGoal(amount, encode_dict) for amount in df_copy['usd_goal_real']]
pledged = [encodeGoal(amount, encode_dict) for amount in df_copy['usd_pledged_real']]

In [None]:
# 1 when the pledged amount reached the goal, 0 otherwise (one entry per project)
states = [
    1 if pled >= goal else 0
    for pled, goal in zip(df_copy['usd_pledged_real'], df_copy['usd_goal_real'])
]

In [None]:
# Bar chart of how many projects fall into each funding-percentage bucket,
# ordered by bucket index and labelled with the bucket's percentage range.
# NOTE(review): `percentage` is only built in a later cell and `plt` is not imported
# in the visible cells, so this cell fails on a fresh top-to-bottom run.
cs = sorted(Counter(percentage).items(), key=lambda x:x[0], reverse=False)
keydict = {0: '0%', 1: '1~10%', 2: '11~99%', 3: '100~120%', 4: '121%~'}
x = [keydict[key] for key, val in cs]
label = [val for key, val in cs]
plt.bar(x, label)
plt.show()

In [None]:
def encodePercentage(p):
    """Map an integer funding percentage to its bucket index.

    Buckets: 0 -> 0, 1..10 -> 1, 11..99 -> 2, 100..120 -> 3; any value that
    belongs to none of them -> 4.
    """
    buckets = ([0], range(1, 11), range(11, 100), range(100, 121))
    for code, members in enumerate(buckets):
        if p in members:
            return code
    # Not contained in any bucket above
    return len(buckets)

# Per project: the funding percentage (pledged / goal * 100, truncated to an int)
# and its bucket index.
percentage = []
percentage_raw = []
for pled, goal in zip(df_copy['usd_pledged_real'], df_copy['usd_goal_real']):
    # NOTE(review): a goal of 0 would make this division fail (or yield inf, which
    # int() rejects) — confirm no such rows exist.
    p = int(pled / goal * 100)
    percentage_raw.append(p)
    percentage.append(encodePercentage(p))

In [None]:
# Sanity check: all engineered feature / label lists must have the same length.
assert len(duration) == len(category) == len(main_category) == len(country) == len(backers) == len(goals) == len(pledged) == len(states) == len(percentage)

In [None]:
import fasttext.util

In [None]:
fasttext.util.download_model('en', if_exists='ignore')

In [None]:
ft = fasttext.load_model('cc.en.300.bin')

In [None]:
# NOTE(review): this cell unpacks seven values per title (including an elongation
# flag and a seventh, unused value), whereas the stringEncoding defined earlier in
# this notebook returns five — it appears to target a different version of that
# function and raises ValueError with the visible one.
numWords, numEntities, numAllCap, numSpecialChar, useOfQuot, useOfElong = [], [], [], [], [], []
for elem in names:
    nw, ne, na, ns, uq, ue, tok = stringEncoding(elem)
    numWords.append(nw)
    numEntities.append(ne)
    numAllCap.append(na)
    numSpecialChar.append(ns)
    useOfQuot.append(uq)
    useOfElong.append(ue)

In [None]:
c = "iCare FHD: World's First Urgent Home Care Camera"
doc2 = nlp(c)
for tok in doc2:
    print(tok)

In [None]:
"'s" in "`~!@#$%^&*()-_=+{}[]:;<>?/'\|,."

In [None]:
a = 'LEDIONIX #ONE - Linear T8 LED-Light - "One for all!!!"'
b = '1yrBeer - Discovering Craft Beer Culture in Colorado,Documentary'

In [None]:
"Linear".isalpha()

In [None]:
doc = nlp(a)
print(len(doc))
for tok in doc:
    print(tok)

In [None]:
type(doc)

In [None]:
# Prototype of elongation removal: a token is skipped when the token right after it
# has identical text, so a run of repeated tokens keeps only its last occurrence.
# Every comparison is printed for inspection.
doc = nlp(a)
tok_len = len(doc)
remove_elong = []
for i, tok in enumerate(doc):
    if i == tok_len-1:
        # The last token has no successor to compare with and is always kept.
        print("here", tok)
        remove_elong.append(tok)
    else:
        print(tok, doc[i+1])
        if tok.text == doc[i+1].text:
            continue
        else:
            remove_elong.append(tok)
print(remove_elong)

In [None]:
num_words = []
for elem in names:

In [None]:
set(df['state'].values.tolist())

In [None]:
for row in df.values:
    print(row)
    break

In [None]:
# For every campaign state, count the projects whose second-to-last column is at
# least as large as the last column (presumably usd_pledged_real vs. usd_goal_real
# of the raw CSV — confirm), and report that count next to the state's row total.
# One loop replaces six copy-pasted stanzas; the printed text is unchanged.
# NOTE(review): this needs the raw frame with its 'state' column; the preprocessing
# cells earlier in the notebook drop that column from df.
for state_name in ['canceled', 'failed', 'live', 'successful', 'suspended', 'undefined']:
    subset = df[df['state'] == state_name]
    cnt = int((subset.iloc[:, -2] >= subset.iloc[:, -1]).sum())
    print("%s pledged count: %d out of %d" %(state_name, cnt, len(subset)))

In [None]:
from spacy.tokens import Doc
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
def removeElongToken(doc):
    """Return a new Doc in which runs of identical consecutive tokens are collapsed.

    A token is dropped when the token right after it has the same text, so only
    the last token of each run survives. The LOWER, POS, ENT_TYPE and IS_ALPHA
    attribute rows of the surviving tokens are copied onto the new Doc.

    Parameters:
        doc: a spaCy Doc.

    Returns:
        A new Doc that shares `doc.vocab` and holds only the surviving tokens.
    """
    # Use this document's own length; the previous code read a global `tok_len`
    # left over from an unrelated scratch cell.
    n_tokens = len(doc)
    keep = [
        i for i in range(n_tokens)
        if i == n_tokens - 1 or doc[i].text != doc[i + 1].text
    ]

    attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]
    # Keep only the attribute rows of surviving tokens so that the array has
    # exactly one row per token of the new Doc.
    kept_attrs = doc.to_array(attrs)[keep]

    doc2 = Doc(doc.vocab, words=[doc[i].text for i in keep])
    doc2.from_array(attrs, kept_attrs)
    return doc2

In [None]:

import re
# Matches any single character (other than a newline) occurring three times in a row, e.g. "!!!".
regex = re.compile(r'(.)\1{2}')