# Data Fetch

* [Fetching from elastic](#Fetching-from-elastic)
    * [Tweets](#Tweets)
    * [Users](#Users)
* [Cleaning](#Cleaning)

In [1]:
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from tqdm import tqdm
import pandas as pd
import csv
import os

from pandas.core.common import SettingWithCopyWarning
import warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

# Fetching from elastic

In [2]:
es = Elasticsearch(['http://192.168.1.92:9200'], request_timeout=30, max_retries=10, retry_on_timeout=True)

In [3]:
def extractTwitterData(outputFile, fieldNames, ignoredFields, queryBody, index = 'tweets'):
    docCount = 0
    with open(outputFile, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldNames)
        writer.writeheader()
        for record in tqdm(helpers.scan(es, index=index, query=queryBody)):      
            recordId = record['_id']
            source = record['_source']
            source['_id'] = recordId
            for f in ignoredFields:
                if (source.get(f)):
                    del source[f]

            writer.writerow(source)
            docCount += 1
            
    print("number of rows:", docCount)

In [4]:
tweetQueryBody = {
  "query": {
    "bool": { 
      "should": [
         {
             "match_phrase": {
                "tweet": "کار از منزل"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار در منزل"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار از خانه"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار در خانه"
             }
         },
         {
            "match_phrase": {
                "tweet": "کار از راه دور"
            }
         },
         {
            "match_phrase": {
                "tweet": "کار ریموت"
            }
         },
         {
            "match_phrase": {
                "tweet": "کار از دور"
            }
         },
         { 
           "query_string": {
                "query": """(دورکاری) 
                        OR (دورکار)
                        OR (teleworking)
                        OR (telecommuting)
                        OR (flexiplace)
                        OR (flexiwork)
                        """, 
                "default_field": "tweet"
            }
        },
        {
             "match_phrase": {
                "tweet": "کار از خونه"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار غیرحضوری"
             }
         },
         {
             "match_phrase": {
                "tweet": "کار غیر حضوری"
             }
         },
         {
             "match_phrase": {
                "tweet": " شغل غیرحضوری"
             }
         },
         {
             "match_phrase": {
                "tweet": "شغل غیر حضوری"
             }
         },
         {
            "match_phrase": {
                "tweet": "شغل خانگی"
            }
         }  
      ]
    }
  }
}
tweetOutputFile = '../data/remote_tweets.csv'
tweetFieldNames = ['language', 'conversation_id', 'retweets_count', 'mentions', 'created_at', 
              'likes_count', 'possibly_sensitive', 'in_reply_to_user_id', '_id', 'source', 
              'user_id', 'in_reply_to_status_id', 'tweet', 'replies_count', 'quoted_status_id', 'hashtags', 'cashtags', 'urls']
ignoredTweetFields = ['thumbnail', 'photos', 'videos']

## Tweets

In [5]:
extractTwitterData(tweetOutputFile, tweetFieldNames, ignoredTweetFields, tweetQueryBody)

45359it [00:24, 1820.18it/s]

number of rows: 45359





Iteration #1: 43441
<br>
Iteration #2: 45359

## Users

In [6]:
userQueryBody = {}
userOutputFile = '../data/twitter_users.csv'
userFields = ['_id', 'username', 'created_at', 'tweets', 'following', 'followers', 'likes', 'media_count', 'verified']
ignoredUserFields = ['bio', 'profile_image_url', 'background_image', 'pinned_status_ids']

In [9]:
extractTwitterData(userOutputFile, userFields, ignoredUserFields, userQueryBody, 'users')

7520298it [34:03, 3680.70it/s] 

number of rows: 7520298





# Cleaning

In [7]:
df_list = []

chunksize = 10 ** 5
with pd.read_csv(tweetOutputFile, chunksize=chunksize) as reader:
    for chunk in tqdm(reader):
        chunk = chunk[chunk.language != 'language']
        chunk.created_at = pd.to_datetime(chunk.created_at)
        
        del chunk['language']
        del chunk['possibly_sensitive']

        chunk.replies_count = pd.to_numeric(chunk.replies_count)
        chunk.retweets_count = pd.to_numeric(chunk.retweets_count)
        chunk.likes_count = pd.to_numeric(chunk.likes_count)
        df_list.append(chunk)

  for obj in iterable:
1it [00:00,  2.34it/s]


In [8]:
result = pd.concat(df_list)
result

Unnamed: 0,conversation_id,retweets_count,mentions,created_at,likes_count,in_reply_to_user_id,_id,source,user_id,in_reply_to_status_id,tweet,replies_count,quoted_status_id,hashtags,cashtags,urls
0,1375557622410149894,2,,2021-03-26 21:23:21+00:00,111,5.720514e+08,1375559020451356674,Twitter for iPhone,572051429,1.375558e+18,به من گزارش میده حقوقش میدونم سالی ۸۵تا، همسرش...,3,,,,
1,1375559420176908296,3,,2021-03-26 21:24:57+00:00,163,,1375559420176908296,Twitter for iPhone,812764546012246017,,اینایی که رفتن سفر، مهمونی و... هیچ پروتکلی هم...,1,,,,
2,1383828354197979136,2,,2021-04-18 17:02:44+00:00,37,,1383828354197979136,Twitter for Android,985101690536898565,,دارم فکر میکنم که چقدر خسته ام با اینکه امروز ...,3,,,,
3,1383840315031781385,0,,2021-04-18 17:50:16+00:00,11,,1383840315031781385,Twitter Web App,310871673,,یک سال و نیمه از قبل آبان ۹۸ دورکار شدم تا الان,1,1.383657e+18,,,
4,1383820625354002439,1,['15782990'],2021-04-18 17:31:42+00:00,9,1.578299e+07,1383835643239747593,Twitter Web App,1271095835028307969,1.383821e+18,@shahrzaad حتمن میرفتم به شهری که به اقوام یا ...,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45354,1434127275587428352,0,['1273546506012897281'],2021-09-04 12:46:01+00:00,3,1.273547e+18,1434135662396018689,Twitter for Android,1260775960267444225,1.434127e+18,@anar_khanoom آخ لجم میگیره از این همه کثافت د...,0,,,,
45355,1434127961943326723,0,['1251428296589488129'],2021-09-04 12:21:54+00:00,2,1.251428e+18,1434129593036939267,Twitter for Android,1214804130096668672,1.434128e+18,@Nabinabia صرفا جهت اطلاع، ۶ ماهه رزومه فرستاد...,1,,,,
45356,1434138057071112194,0,,2021-09-04 12:55:32+00:00,4,,1434138057071112194,Twitter for iPhone,1147481863138811905,,اگه خواهرم این کار رو استخدام بشه، دورکاری خوا...,0,,,,
45357,1434133064750338048,0,,2021-09-04 12:35:42+00:00,18,,1434133064750338048,Twitter for Android,1106832761216815104,,دورکاری https://t.co/GYWHxcI9Z4,0,,,,


In [9]:
result.to_csv(tweetOutputFile, index=False)