# Imports 

In [1]:
from math import ceil
from time import sleep
from random import randint
from bs4 import BeautifulSoup


from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

import pandas as pd
import numpy as np
import requests
import json
import pickle


ModuleNotFoundError: No module named 'bs4'

# SPSS Fitting

## Macro

In [165]:
# Load the previously pickled campaign objects from disk.
# NOTE(review): unpickling presumably needs the original campaign/donation/
# actuality classes to be importable in this session — confirm before running.
with open('data/campaigns.pkl', 'rb') as campaigns_file:
    projects = pickle.load(campaigns_file)

In [166]:
# Lower-case substrings whose presence in an actuality's title or content marks
# it as a "persuasive" update (the classification loop lower-cases the text
# before matching). Each entry appears once; the original list repeated 'likez'.
# NOTE(review): 'j-' presumably targets countdown phrases such as "J-3" — confirm.
STIMULI = [
    'continuez',
    'partager',
    'partagez',
    'on compte sur vous',
    'parlez',
    'likez',
    'contribuer',
    'contribuez',
    'aidez',
    'soutenez',
    'share',
    'donnez',
    'persuadez',
    'j-',
]

In [183]:
# Enrich every project with campaign-level ("macro") features:
#   * donation statistics: num_don, mean_don, jump_1, jump_mean
#   * the campaign end date, scraped from the project's description tab
#   * a `kind` label on each actuality ('oos', 'pers' or 'inf') together with
#     the per-project counters num_pers / num_info
# Raises RuntimeError when the description page cannot be fetched.
max_fetch_attempts = 10  # give up instead of retrying a broken page forever

for p in projects:
    # DONATIONS
    p.num_don = len(p.donations)
    # Amounts may be stored as strings (the per-day cell applies the same
    # filter); only non-string amounts take part in the statistics below.
    amounts = [
        don.amount
        for don in p.donations
        if not isinstance(don, str) and not isinstance(don.amount, str)
    ]
    # max(..., 1) keeps the mean at 0 when a project has no usable amount.
    p.mean_don = sum(amounts) / max(len(amounts), 1)
    p.jump_1 = sum(1 for amount in amounts if amount > 1)
    p.jump_mean = sum(1 for amount in amounts if amount > p.mean_don)

    # ACTUALITIES
    p.num_pers = 0
    p.num_info = 0
    description_url = p.link + '/tabs/description'
    for _ in range(max_fetch_attempts):
        response = requests.get(description_url)
        sleep(randint(0, 4))  # random pause between requests
        if response.status_code == 200:
            break
    else:
        raise RuntimeError(
            'could not fetch %s after %d attempts (last status: %s)'
            % (description_url, max_fetch_attempts, response.status_code)
        )
    soup = BeautifulSoup(response.content, 'html5lib')
    sub_prompt = soup.find_all('div', class_='gDdVEG')
    # The second matching <div> is parsed as the end date (dd/mm/YYYY).
    # NOTE(review): 'gDdVEG' is an opaque CSS class — verify it still selects
    # the expected elements on the live site.
    p.end_date = pd.to_datetime(sub_prompt[1].text, format='%d/%m/%Y')
    for actu in p.actualities:
        # NOTE(review): this compares actu.date with a pandas Timestamp —
        # confirm the two types are comparable in the pandas version in use.
        if actu.date > p.end_date:
            # Posted after the campaign ended: labelled 'oos', not counted.
            actu.kind = 'oos'
            continue
        texts = (actu.content.lower(), actu.title.lower())
        if any(stimulus in text for stimulus in STIMULI for text in texts):
            actu.kind = 'pers'
            p.num_pers += 1
        else:
            actu.kind = 'inf'
            p.num_info += 1

In [169]:
# One row per project; the values follow the order of `campaign_fields`.
# NOTE(review): 'complation_rate' and 'catefories' look like misspellings, but
# they are the attribute names read from the project objects — confirm against
# the class definition before renaming anything.
campaign_fields = (
    'link', 'title', 'description', 'project_holder', 'current_amount',
    'aimed_amount', 'nb_contrib', 'complation_rate', 'catefories',
    'comments', 'num_pers', 'num_info', 'end_date', 'num_don', 'mean_don',
    'jump_1', 'jump_mean',
)
campaigns = [
    [getattr(project, field) for field in campaign_fields]
    for project in projects
]

In [170]:
# Column labels for the campaign-level table, in the same order as the values
# inside each row of `campaigns`.
macro_columns = [
    'link',
    'title',
    'desc',
    'project_holder',
    'amount',
    'aimed_amount',
    'nb_contrib',
    'completion_rate',
    'categories',
    'comments',
    'num_pers',
    'num_info',
    'end_date',
    'num_don',
    'mean_don',
    'num_jump_1',
    'num_jump_mean',
]
df = pd.DataFrame(campaigns, columns=macro_columns)

In [171]:
# Derived campaign-level columns.
df.loc[:, 'nb_actus'] = df['num_pers'] + df['num_info']
df.loc[:, 'duration'] = 60  # every campaign gets the same fixed duration, in days
df.loc[:, 'over_success'] = df['completion_rate'] > 105
df.loc[:, 'start_date'] = df['end_date'] - pd.to_timedelta(df['duration'], unit='D')

# Columns created empty here; nothing in this cell fills them in.
for placeholder in ('nb_max_reward_lvl', 'facebook', 'nb_likes_fb', 'other_projects'):
    df.loc[:, placeholder] = ''

In [172]:
# Build one boolean indicator column per category.
# `categories` holds a ', '-separated string per campaign; split it once.
category_lists = df['categories'].str.split(', ')

# Sorted so the indicator columns come out in a reproducible order
# (a bare set has no stable iteration order across runs).
cats = sorted({cat for parts in category_lists.dropna() for cat in parts})

for cat in cats:
    # Exact membership test: a category name is neither interpreted as a
    # regular expression nor matched as a substring of a longer category.
    # Rows with a missing category string get False.
    df.loc[:, cat] = category_lists.apply(
        lambda parts, cat=cat: isinstance(parts, list) and cat in parts
    )

In [173]:
# Write the campaign-level table as a ';'-separated CSV (index included).
df.to_csv(path_or_buf='data/output/macro.csv', sep=';')

## Micro

In [185]:
# Names of the per-day "jump" columns (flag and count, for both thresholds).
cols = [
    'jump_1_j',
    'jump_1_j_nb',
    'jump_avg_j',
    'jump_avg_j_nb',
]


In [225]:
# Build the per-day ("micro") table: one row per project and per campaign day.
# Each row holds that day's donation, information-update and persuasion-update
# counts, plus running totals accumulated since day 1 of the window.
project_list = []
campaign_days = 60  # length of the observation window, in days
# Shared read-only default for days on which nothing happened.
no_events = {'pers': [], 'inf': [], 'don': []}

for p in projects:
    # First day of the window, so that day `campaign_days` is the end date.
    current_date = p.end_date.date() - pd.to_timedelta(campaign_days - 1, unit='d')

    # Group events by date in a single pass over each list. Actualities whose
    # kind is neither 'pers' nor 'inf' (e.g. 'oos') are ignored.
    event_by_date = {}
    for actu in p.actualities:
        if actu.kind in ('pers', 'inf'):
            bucket = event_by_date.setdefault(actu.date, {'pers': [], 'inf': [], 'don': []})
            bucket[actu.kind].append(actu)
    for don in p.donations:
        bucket = event_by_date.setdefault(don.date, {'pers': [], 'inf': [], 'don': []})
        bucket['don'].append(don)

    # Completion metrics need a numeric, non-zero goal; otherwise the two
    # columns are left out of the row (they show up as NaN in the DataFrame).
    has_usable_goal = not isinstance(p.aimed_amount, str) and p.aimed_amount != 0

    p_amount = 0  # running sum of numeric donation amounts
    nb_don = 0    # running count of donations (string amounts included)

    for i in range(1, campaign_days + 1):
        day_data = event_by_date.get(current_date, no_events)
        n_don_day = len(day_data['don'])
        n_inf_day = len(day_data['inf'])
        n_pers_day = len(day_data['pers'])

        d = {}
        d['link'] = p.link
        d['day'] = i

        nb_don += n_don_day
        # Average number of donations per day so far.
        d['attrac_rel_j'] = nb_don / i
        d['above_attrac_rel_j'] = n_don_day > d['attrac_rel_j']
        d['n_don_j'] = n_don_day
        d['don_j'] = n_don_day >= 1

        # Only non-string amounts enter the monetary statistics.
        rel_don = [don.amount for don in day_data['don'] if not isinstance(don.amount, str)]
        somme_rel_don = sum(rel_don)
        p_amount += somme_rel_don

        d['avg_don_j'] = somme_rel_don / max(len(rel_don), 1)
        d['jump_1_j_nb'] = sum(1 for amount in rel_don if amount > 1)
        d['jump_1_j'] = d['jump_1_j_nb'] != 0

        d['jump_avg_j_nb'] = sum(1 for amount in rel_don if amount > p.mean_don)
        d['jump_avg_j'] = d['jump_avg_j_nb'] != 0

        d['n_inf_j'] = n_inf_day
        d['inf_j'] = n_inf_day >= 1

        d['n_pers_j'] = n_pers_day
        d['pers_j'] = n_pers_day >= 1

        if has_usable_goal:
            d['compl_rate_j'] = int(p_amount / p.aimed_amount * 100)
            d['over_success_j'] = p_amount > p.aimed_amount
        # Columns kept as empty placeholders; no data is computed for them here.
        d['n_comment_j'] = None
        d['comment_j'] = None
        d['n_don_level_j'] = None
        d['don_level_j'] = None

        project_list.append(d)
        current_date = current_date + pd.to_timedelta(1, unit='d')

df = pd.DataFrame(project_list)

[datetime.date(2020, 5, 7)]
dict_keys([datetime.date(2020, 6, 9), datetime.date(2020, 6, 13), datetime.date(2020, 7, 6), datetime.date(2020, 6, 27), datetime.date(2020, 5, 27), datetime.date(2020, 5, 21), datetime.date(2020, 6, 8), datetime.date(2020, 5, 22), datetime.date(2020, 7, 5), datetime.date(2020, 6, 22), datetime.date(2020, 6, 26), datetime.date(2020, 6, 7), datetime.date(2020, 6, 14), datetime.date(2020, 6, 6), datetime.date(2020, 6, 5), datetime.date(2020, 5, 26), datetime.date(2020, 5, 20), datetime.date(2020, 6, 16), datetime.date(2020, 5, 24), datetime.date(2020, 6, 2), datetime.date(2020, 6, 15), datetime.date(2020, 6, 28), datetime.date(2020, 6, 4), datetime.date(2020, 7, 8)])
[datetime.date(2020, 5, 7)]
dict_keys([datetime.date(2020, 7, 7), datetime.date(2020, 6, 9), datetime.date(2020, 6, 11), datetime.date(2020, 6, 10), datetime.date(2020, 6, 13), datetime.date(2020, 7, 3), datetime.date(2020, 7, 6), datetime.date(2020, 6, 30), datetime.date(2020, 6, 19), datetime.da

In [230]:
# Write the per-day table as a ';'-separated CSV (index included).
df.to_csv(path_or_buf='data/output/micro.csv', sep=';')