# In [None]:
import pandas as pd

def extract_sentence_count(df, text_col='clean_text', count_col=['doc_id'], grouped_by=[], time_col='dt',
                           time_bin=None,start_time = None,end_time = None):
    """Count, per time bin, the distinct ``count_col`` combinations each text occurs in.

    Rows are assigned to left-closed time bins built from ``time_col``,
    de-duplicated on ``[text_col] + count_col + grouped_by + ['dt_bin']`` and
    the occurrences of every ``text_col`` value are then counted inside each
    ``grouped_by`` + bin group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain ``text_col``, ``time_col`` and every column
        named in ``count_col`` and ``grouped_by``.
    text_col : str
        Column holding the text whose occurrences are counted.
    count_col : list of str
        Columns identifying one countable unit. The list is never modified.
    grouped_by : list of str
        Extra grouping columns. The list is never modified.
    time_col : str
        Column holding the timestamps used for binning.
    time_bin : str or None
        Bin width written as ``'<int><unit>'`` with unit ``'D'`` (days),
        ``'M'`` (months) or ``'Y'`` (years). ``None`` puts everything into a
        single bin ``[start_time, end_time + 1 second)``.
    start_time, end_time : timestamp-like or None
        Range to bin; default to the minimum / maximum of ``time_col``. When
        ``time_bin`` is given, the first bin edge is ``start_time`` floored to
        the start of its day, month or year.

    Returns
    -------
    pandas.DataFrame
        Columns: the ``grouped_by`` columns, ``text_col``, a count column
        named ``'count_<count_col>_by_<grouped_by,dt_bin>'`` and the bin edges
        ``dt_bin_l`` / ``dt_bin_r``. An empty ``df`` is returned as an
        unmodified copy.
        NOTE(review): because ``dt_bin`` is categorical, some pandas versions
        also emit zero-count rows for unobserved bins - confirm whether
        callers rely on that.

    Raises
    ------
    ValueError
        If the unit of ``time_bin`` is not ``D``, ``M`` or ``Y``, or the bin
        size is not positive.
    """
    if len(df) == 0:
        return df.copy()

    if start_time is None:
        start_time = df[time_col].min()
    if end_time is None:
        end_time = df[time_col].max()

    if time_bin is None:
        # One bin covering everything; the extra second keeps the maximum
        # inside the right-open interval.
        bins = [start_time, end_time + pd.DateOffset(seconds=1)]
    else:
        size, unit = int(time_bin[:-1]), time_bin[-1]
        if unit == 'D':
            step = pd.DateOffset(days=size)
            edge = pd.Timestamp(year=start_time.year, month=start_time.month, day=start_time.day)
        elif unit == 'M':
            step = pd.DateOffset(months=size)
            edge = pd.Timestamp(year=start_time.year, month=start_time.month, day=1)
        elif unit == 'Y':
            step = pd.DateOffset(years=size)
            edge = pd.Timestamp(year=start_time.year, month=1, day=1)
        else:
            raise ValueError('invalid syntax for time bin')
        if size <= 0:
            # A zero or negative step would never move past end_time.
            raise ValueError('time bin size must be positive')

        bins = []
        while edge <= end_time:
            bins.append(edge)
            edge += step
        # Closing edge of the last bin.
        bins.append(edge)

    _df = df.copy()
    _df['dt_bin'] = pd.cut(_df[time_col], bins, labels=None, right=False)

    # Work on copies so neither the default lists nor the caller's lists are
    # altered between calls.
    count_cols = list(count_col)
    group_cols = list(grouped_by) + ['dt_bin']

    distinct = _df[[text_col] + count_cols + group_cols].drop_duplicates()
    count_name = 'count_{}_by_{}'.format(",".join(count_cols), ",".join(group_cols))
    _df = distinct.groupby(group_cols)[text_col].value_counts().rename(count_name).reset_index()

    # Replace the interval column by its two edges.
    _df['dt_bin_l'] = _df['dt_bin'].apply(lambda x : x.left)
    _df['dt_bin_r'] = _df['dt_bin'].apply(lambda x : x.right)
    _df = _df.drop(columns = ['dt_bin'])

    return _df


def aggregate_over_time(dfs, time_intervals, time_bin_cols=['dt_bin_l','dt_bin_r']):
    """Attach to every row of each frame the time intervals its bin overlaps.

    For each data frame the bins are rebuilt as left-closed intervals from the
    categories of the two ``time_bin_cols`` columns. Each row is then repeated
    once per entry of ``time_intervals`` that overlaps the row's bin, and the
    bin-edge columns are dropped.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames whose ``time_bin_cols`` columns are categorical; the i-th
        category of the left column is paired with the i-th category of the
        right column.
    time_intervals : iterable of pandas.Interval
        Target intervals to test for overlap.
    time_bin_cols : list of str
        Names of the left-edge and right-edge columns, in that order.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-frame results. The first column is
        ``time_frames`` (object dtype, holding the matching interval),
        followed by the remaining columns of the input frames.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when ``dfs`` is empty.
    """
    left_col, right_col = time_bin_cols[0], time_bin_cols[1]
    frames = list(time_intervals)
    pieces = []

    for df in dfs:
        _df = df.copy()
        lefts = _df[left_col].cat.categories
        rights = _df[right_col].cat.categories
        source_bins = [pd.Interval(lo, hi, closed='left') for lo, hi in zip(lefts, rights)]

        # One (bin start, frame) pair per overlap, ordered by frame then bin.
        pairs = [(b.left, frame) for frame in frames for b in source_bins if frame.overlaps(b)]
        mapping = pd.DataFrame({
            left_col: pd.Series([lo for lo, _ in pairs], dtype='datetime64[ns]'),
            # Kept as plain objects so the intervals are not re-inferred
            # into an interval-typed column.
            'time_frames': pd.Series([frame for _, frame in pairs], dtype=object),
        })

        # The merge key is categorical in the input; give both sides the same
        # datetime dtype before joining.
        _df[left_col] = _df[left_col].astype('datetime64[ns]')
        merged = pd.merge(mapping, _df, on=[left_col]).drop(columns=list(time_bin_cols))
        pieces.append(merged)

    return pd.concat(pieces)





def _parse_offset_spec(spec, label, positive=False):
    """Parse a ``'<int><D|M|Y>'`` spec into ``(pd.DateOffset, unit letter)``."""
    unit = spec[-1]
    keyword = {'D': 'days', 'M': 'months', 'Y': 'years'}.get(unit)
    if keyword is None:
        raise ValueError('invalid syntax for {}'.format(label))
    size = int(spec[:-1])
    if positive and size <= 0:
        raise ValueError('{} must be positive'.format(label))
    return pd.DateOffset(**{keyword: size}), unit


def make_sentence_count(df, sentence_col='clean_sentence', dfs_count=None, time_col='dt', freq='1M',
                        window='2Y',dfs_count_time_bin_cols = ['dt_bin_l','dt_bin_r']):
    """Attach to every row of ``df`` a ``count`` summed from pre-computed count frames.

    ``df`` is binned on ``time_col`` with step ``freq``. Each bin is turned
    into a look-back frame that ends at the bin's right edge and starts
    ``window`` earlier (or at the earliest timestamp of ``df`` when ``window``
    is ``None``). The frames in ``dfs_count`` are mapped onto those look-back
    frames with ``aggregate_over_time``, summed per frame and sentence, and
    right-merged onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to annotate; must contain ``sentence_col`` and ``time_col``.
    sentence_col : str
        Sentence column, present in both ``df`` and the count frames.
    dfs_count : iterable of pandas.DataFrame
        Count frames, passed unchanged to ``aggregate_over_time``. Despite the
        ``None`` default an iterable is required, because that function
        iterates over it. After aggregation exactly one value column besides
        the frame and sentence columns is expected; it is renamed ``count``.
    time_col : str
        Timestamp column of ``df``.
    freq : str or None
        Bin step as ``'<int><D|M|Y>'``; ``None`` uses a single bin spanning
        the whole range of ``time_col``.
    window : str or None
        Look-back length as ``'<int><D|M|Y>'``; ``None`` looks back to the
        earliest timestamp of ``df``.
    dfs_count_time_bin_cols : list of str
        Left-edge / right-edge column names of the count frames.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an integer ``count`` column. Rows without a matching
        aggregate get a count of 1; missing values in other columns are left
        untouched.

    Raises
    ------
    ValueError
        If ``freq`` or ``window`` has an unknown unit, or ``freq`` is not
        positive.
    """
    _df = df.copy()

    min_time = df[time_col].min()
    end_time = df[time_col].max()

    if freq is None:
        edge = min_time
        # The extra second keeps the maximum inside the right-open bin.
        step = end_time - min_time + pd.Timedelta(seconds=1)
    else:
        step, unit = _parse_offset_spec(freq, 'frequency', positive=True)
        # Floor the first edge to the start of its day, month or year.
        edge = pd.Timestamp(
            year=min_time.year,
            month=1 if unit == 'Y' else min_time.month,
            day=min_time.day if unit == 'D' else 1,
        )

    if window is not None:
        window, _ = _parse_offset_spec(window, 'window')

    time_bins = []
    while edge <= end_time:
        time_bins.append(edge)
        edge += step
    # Closing edge of the last bin.
    time_bins.append(edge)

    binned = pd.cut(_df[time_col], time_bins, labels=None, right=False)

    def look_back(interval):
        if window is None:
            return pd.Interval(min_time, interval.right, closed='left')
        return pd.Interval(interval.right - window, interval.right, closed='left')

    # One look-back frame per bin, including bins that hold no rows.
    frame_by_bin = {bin_: look_back(bin_) for bin_ in binned.cat.categories}
    time_frames = list(frame_by_bin.values())
    # Store the frames as plain objects so the merge keys on both sides share
    # a dtype.
    _df['time_frame'] = pd.Series(
        [frame_by_bin.get(bin_) for bin_ in binned], index=_df.index, dtype=object)

    aggregated = aggregate_over_time(dfs_count, time_frames, time_bin_cols=dfs_count_time_bin_cols)
    # aggregate_over_time labels the interval column 'time_frames'.
    aggregated = aggregated.rename(columns={'time_frames': 'time_frame'})
    totals = aggregated.groupby(['time_frame', sentence_col]).sum().reset_index()
    totals.columns = ['time_frame', sentence_col, 'count']
    totals['time_frame'] = totals['time_frame'].astype(object)

    _df = pd.merge(totals, _df, how='right', on=['time_frame', sentence_col]).drop(columns=['time_frame'])
    # Only the count is defaulted; other columns keep their missing values.
    _df['count'] = _df['count'].fillna(1).astype(int)
    return _df