In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
from datetime import timedelta

In [2]:

csv_files= [f for f in listdir('../data/raw') if isfile(join('../data/raw', f))]

# General File Format

In [3]:
pd.read_csv(f'../data/raw/{csv_files[0]}')

Unnamed: 0.1,Unnamed: 0,uid,title,time,source,time-capture,sc,domain,full-text,sc2,geo
0,0,405ca5ff-f11a-42bf-8ba7-4856c9c88921,Czech Prime Minister: The Russian army committ...,a few seconds ago,https://twitter.com/AJABreaking/status/1510630...,2022-04-03 10:54:54.486609,False,"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIx...",SourceOn live map\nTell friends\na few seconds...,False,50°5′N 14°25′E
1,1,58755ebd-e61e-4311-9c2c-4a088a736ec9,Heavy shelling in Kharkiv again,7 minutes ago,https://t.me/kharkivlife/33583,2022-04-03 10:54:59.073486,False,"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIx...",SourceOn live map\nTell friends\n8 minutes ago...,False,50°2′N 36°15′E


## Getting unique times 

__Times are formatted weirdly__
- All filenames of csv represent time of retrieval 
- time column contains the time relative to file name 

In [4]:
unique_queue = []
for file_name in csv_files:
    df = pd.read_csv(f'../data/raw/{file_name}')
    unique_queue = unique_queue + df['time'].unique().tolist()

In [5]:
unique_queue

['a few seconds ago',
 '7 minutes ago',
 '4 hours ago',
 nan,
 '5 hours ago',
 '6 hours ago',
 '7 hours ago',
 '8 hours ago',
 '9 hours ago',
 '10 hours ago',
 '11 hours ago',
 '12 hours ago',
 '13 hours ago',
 '14 hours ago',
 '15 hours ago',
 '16 hours ago',
 '41 minutes ago',
 nan,
 '6 hours ago',
 '7 hours ago',
 nan,
 '8 hours ago',
 '9 hours ago',
 '10 hours ago',
 '11 hours ago',
 '12 hours ago',
 '13 hours ago',
 '14 hours ago',
 '16 hours ago',
 '17 hours ago',
 '18 hours ago',
 '19 hours ago',
 '20 hours ago',
 '21 hours ago',
 'a minute ago',
 '23 minutes ago',
 '33 minutes ago',
 '2 hours ago',
 'a few seconds ago',
 '10 minutes ago',
 '11 minutes ago',
 '14 minutes ago',
 '39 minutes ago',
 '2 hours ago',
 '3 hours ago',
 '4 hours ago',
 '5 hours ago',
 '6 hours ago',
 '7 hours ago',
 '8 hours ago',
 '16 hours ago',
 nan,
 '10 minutes ago',
 '16 minutes ago',
 '22 minutes ago',
 'a day ago',
 '2 minutes ago',
 '4 minutes ago',
 '13 minutes ago',
 '39 minutes ago',
 '4 minu

__Formats found__<br/>

a day ago <br/>
__X__ hours ago <br/>
an hour ago <br/>
__X__ minutes ago <br/>
a minutes ago <br/>
a few seconds ago <br/>
nan



## Getting time reference

In [6]:
csv_files[0]

'2022-04-03 10:55:03.458180.csv'

In [7]:
from dateutil import parser
parser.parse(csv_files[0].split('.')[0])

datetime.datetime(2022, 4, 3, 10, 55, 3)

----------------------------
----------------------------
----------------------------

# Creating Final Dataset for one file 

In [8]:
filename = csv_files[0]
df = pd.read_csv(f'../data/raw/{filename}')
table_reference_time = parser.parse(filename.split('.')[0])

In [9]:
# Dropping na in time column
df.dropna(subset=['time'],inplace=True)

In [11]:
def table_date_parser (row,ref_date):
    table_date = row['time']
    
    if table_date == 'a day ago':
        delta = timedelta(days=-1)
        return ref_date + delta
    
    elif table_date == 'an hour ago':
        delta = timedelta(hours=-1)
        return ref_date + delta
    
    elif 'hours ago' in table_date:
        delta = timedelta(hours=-1*int(table_date.split(' ')[0]))
        return ref_date + delta
    
    elif table_date == 'a minute ago':
        delta = timedelta(minutes=-1)
        return ref_date + delta
    
    elif 'minutes ago' in table_date:
        delta = timedelta(minutes=-1*int(table_date.split(' ')[0]))
        return ref_date + delta
    
    elif table_date == 'a few seconds ago':
        return ref_date 
    
    else:
        print('Problem Parsing date')
        return ref_date

In [14]:
df['time_parsed'] = df.apply (lambda row: table_date_parser(row,table_reference_time), axis=1)
df

Unnamed: 0.1,Unnamed: 0,uid,title,time,source,time-capture,sc,domain,full-text,sc2,geo,time_parsed
0,0,405ca5ff-f11a-42bf-8ba7-4856c9c88921,Czech Prime Minister: The Russian army committ...,a few seconds ago,https://twitter.com/AJABreaking/status/1510630...,2022-04-03 10:54:54.486609,False,"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIx...",SourceOn live map\nTell friends\na few seconds...,False,50°5′N 14°25′E,2022-04-03 10:55:03
1,1,58755ebd-e61e-4311-9c2c-4a088a736ec9,Heavy shelling in Kharkiv again,7 minutes ago,https://t.me/kharkivlife/33583,2022-04-03 10:54:59.073486,False,"data:image/svg+xml;base64,PHN2ZyB2ZXJzaW9uPSIx...",SourceOn live map\nTell friends\n8 minutes ago...,False,50°2′N 36°15′E,2022-04-03 10:48:03


# Generalizing for all dataset

In [18]:
totalDF = pd.DataFrame()

In [24]:
frames = []
for filename in csv_files:
    df = pd.read_csv(f'../data/raw/{filename}')
    table_reference_time = parser.parse(filename.split('.')[0])
    df.dropna(subset=['time'],inplace=True)
    df['time_parsed'] = df.apply (lambda row: table_date_parser(row,table_reference_time), axis=1)
    frames.append(df)
    
    

In [25]:
totalDF = pd.concat(frames)

In [27]:
totalDF.to_csv('../data/processed/input-data.csv')