<h2>Data Fetching</h2>
<p style="text-indent :2em;"><a href="#Fetching-from-elastic"> 1. Fetching From Elastic.</a></p>
<p style="text-indent :2em;"><a href="#Cleaning"> 2. Cleaning.</a></p>

<br>

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm import tqdm
import csv
import os

## Fetching from elastic

In [2]:
es = Elasticsearch(['http://192.168.1.92:9200'], timeout=30, max_retries=10, retry_on_timeout=True)

In [3]:
def extractTwitterData(outputFile, fieldNames, ignoredFields, queryBody, index = 'tweets'):
    docCount = 0
    with open(outputFile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldNames)
        writer.writeheader()
        for record in tqdm(helpers.scan(es, index=index, query=queryBody)):      
            recordId = record['_id']
            source = record['_source']
            source['_id'] = recordId
            for f in ignoredFields:
                if (source.get(f)):
                    del source[f]

            writer.writerow(source)
            docCount += 1
            
    print("number of rows:", docCount)

In [6]:
tweetQueryBody = {
  "query": {
    "bool": { 
      "should": [
         {
             "match_phrase": {
                "tweet": "کار از منزل"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار در منزل"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار از خانه"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار در خانه"
             }
         },
         {
            "match_phrase": {
                "tweet": "کار از راه دور"
            }
         },
         {
            "match_phrase": {
                "tweet": "کار ریموت"
            }
         },
         {
            "match_phrase": {
                "tweet": "کار از دور"
            }
         },
         { 
           "query_string": {
                "query": """(دورکاری) 
                        OR (دورکار)
                        OR (teleworking)
                        OR (telecommuting)
                        OR (flexiplace)
                        OR (flexiwork)
                        """, 
                "default_field": "tweet"
            }
        }
      ]
    }
  }
}
tweetOutputFile = '../data/remote_tweets.csv'
tweetFieldNames = ['language', 'conversation_id', 'retweets_count', 'mentions', 'created_at', 
              'likes_count', 'possibly_sensitive', 'in_reply_to_user_id', '_id', 'source', 
              'user_id', 'in_reply_to_status_id', 'tweet', 'replies_count', 'quoted_status_id', 'hashtags', 'cashtags', 'urls']
ignoredTweetFields = ['thumbnail', 'photos', 'videos']

### Tweets

In [7]:
extractTwitterData(tweetOutputFile, tweetFieldNames, ignoredTweetFields, tweetQueryBody)

43441it [05:52, 123.14it/s]

number of rows: 43441





In [8]:
userQueryBody = {}
userOutputFile = '../data/twitter_users.csv'
userFields = ['_id', 'username', 'created_at', 'tweets', 'following', 'followers', 'likes', 'media_count', 'verified']
ignoredUserFields = ['bio', 'profile_image_url', 'background_image', 'pinned_status_ids']

### Users

In [9]:
extractTwitterData(userOutputFile, userFields, ignoredUserFields, userQueryBody, 'users')

7520298it [34:03, 3680.70it/s] 

number of rows: 7520298





## Cleaning

In [10]:
import pandas as pd

In [11]:
df_list = []

chunksize = 10 ** 5
with pd.read_csv(tweetOutputFile, chunksize=chunksize) as reader:
    for chunk in tqdm(reader):
        chunk = chunk[chunk.language != 'language']
        chunk.created_at = pd.to_datetime(chunk.created_at)
        
        del chunk['language']
        del chunk['possibly_sensitive']

        chunk.replies_count = pd.to_numeric(chunk.replies_count)
        chunk.retweets_count = pd.to_numeric(chunk.retweets_count)
        chunk.likes_count = pd.to_numeric(chunk.likes_count)
        df_list.append(chunk)

  exec(code_obj, self.user_global_ns, self.user_ns)
1it [00:00,  1.97it/s]


In [12]:
result = pd.concat(df_list)
result

Unnamed: 0,conversation_id,retweets_count,mentions,created_at,likes_count,in_reply_to_user_id,_id,source,user_id,in_reply_to_status_id,tweet,replies_count,quoted_status_id,hashtags,cashtags,urls
0,1375559420176908296,3,,2021-03-26 21:24:57+00:00,163,,1375559420176908296,Twitter for iPhone,812764546012246017,,اینایی که رفتن سفر، مهمونی و... هیچ پروتکلی هم...,1,,,,
1,1383828354197979136,2,,2021-04-18 17:02:44+00:00,37,,1383828354197979136,Twitter for Android,985101690536898565,,دارم فکر میکنم که چقدر خسته ام با اینکه امروز ...,3,,,,
2,1383840315031781385,0,,2021-04-18 17:50:16+00:00,11,,1383840315031781385,Twitter Web App,310871673,,یک سال و نیمه از قبل آبان ۹۸ دورکار شدم تا الان,1,1.383657e+18,,,
3,1383820625354002439,1,['15782990'],2021-04-18 17:31:42+00:00,9,1.578299e+07,1383835643239747593,Twitter Web App,1271095835028307969,1.383821e+18,@shahrzaad حتمن میرفتم به شهری که به اقوام یا ...,1,,,,
4,1383830770125443083,0,,2021-04-18 17:12:20+00:00,8,,1383830770125443083,Twitter Web App,897798038563033088,,دورکاری یا کار از خونه خیلی برام فرسایشی شده د...,2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43436,1434127275587428352,0,['1273546506012897281'],2021-09-04 12:46:01+00:00,3,1.273547e+18,1434135662396018689,Twitter for Android,1260775960267444225,1.434127e+18,@anar_khanoom آخ لجم میگیره از این همه کثافت د...,0,,,,
43437,1434127961943326723,0,['1251428296589488129'],2021-09-04 12:21:54+00:00,2,1.251428e+18,1434129593036939267,Twitter for Android,1214804130096668672,1.434128e+18,@Nabinabia صرفا جهت اطلاع، ۶ ماهه رزومه فرستاد...,1,,,,
43438,1434138057071112194,0,,2021-09-04 12:55:32+00:00,4,,1434138057071112194,Twitter for iPhone,1147481863138811905,,اگه خواهرم این کار رو استخدام بشه، دورکاری خوا...,0,,,,
43439,1434133064750338048,0,,2021-09-04 12:35:42+00:00,18,,1434133064750338048,Twitter for Android,1106832761216815104,,دورکاری https://t.co/GYWHxcI9Z4,0,,,,


In [13]:
result.to_csv(tweetOutputFile, index=False)