In [1]:
import numpy as np
import pandas as pd
import re
from unidecode import unidecode

In [2]:
def clean_string(s):
    s = unidecode(s)
    #Replace newlines with spaces
    s = s.replace("\n"," ")
    #Delete handles
    s = re.sub(r"@(.+?)\b:*","",s)
    #Delete "RT"
    s = re.sub(r"RT","",s)
    #Delete hashtags
    s = re.sub(r"#(.+?)\b","",s)
    #Delete hyperlinks
    s = re.sub(r"http(.+?)(?:$|\s)","",s)
    s = s.strip()
    return s

In [3]:
def cleanDataframe(df):
    print(f"Before:\t{df.shape}")
    df = df.copy()

    #Remove unnecessary columns
    df=df[["created_at","text","truncated","lang"]]

    #Drop rows with missing text
    df.dropna(subset=["text"], inplace=True)

    #Drop rows with truncated text
    df = df.loc[df["truncated"]==0]
    df.shape

    #Drop all non-English entries
    df = df.loc[df["lang"]=="en"]
    df.shape

    #Drop the now-unnecessary columns
    df = df[["created_at","text"]]

    #Clean the strings
    df["text"] = df["text"].apply(clean_string)
    df.sample(10, random_state=11)

    #Reset the index
    df.reset_index(drop=True,inplace=True)
    
    print(f"After:\t{df.shape}")
    return df

In [4]:
import os

#Select 4 hours of tweets: hours 0, 6, 12, and 18
basePath = "2020_06_11"
directories = ["00", "06", "12", "18"]
paths = [os.path.join(basePath,d) for d in directories]

df_big = pd.DataFrame()
for path in paths:
    for file_name in os.listdir(path):
        print(os.path.join(path,file_name)
        df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
        df_big = pd.concat([df_big, cleanDataframe(df)])
        print(f"Full shape: {df_big.shape}")

30.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3239, 37)
After:	(790, 2)
Full shape: (790, 2)
51.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3468, 38)
After:	(752, 2)
Full shape: (1542, 2)
54.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3018, 37)
After:	(732, 2)
Full shape: (2274, 2)
55.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2947, 37)
After:	(728, 2)
Full shape: (3002, 2)
58.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2974, 37)
After:	(726, 2)
Full shape: (3728, 2)
33.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3295, 38)
After:	(773, 2)
Full shape: (4501, 2)
48.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2906, 37)
After:	(757, 2)
Full shape: (5258, 2)
56.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2868, 38)
After:	(725, 2)
Full shape: (5983, 2)
43.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3241, 37)
After:	(694, 2)
Full shape: (6677, 2)
47.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2960, 37)
After:	(774, 2)
Full shape: (7451, 2)
42.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3380, 37)
After:	(747, 2)
Full shape: (8198, 2)
31.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3005, 37)
After:	(755, 2)
Full shape: (8953, 2)
52.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3016, 37)
After:	(682, 2)
Full shape: (9635, 2)
35.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3052, 37)
After:	(724, 2)
Full shape: (10359, 2)
40.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2919, 37)
After:	(743, 2)
Full shape: (11102, 2)
46.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3004, 37)
After:	(717, 2)
Full shape: (11819, 2)
49.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3020, 37)
After:	(709, 2)
Full shape: (12528, 2)
59.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2882, 37)
After:	(742, 2)
Full shape: (13270, 2)
45.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3049, 37)
After:	(720, 2)
Full shape: (13990, 2)
53.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3185, 38)
After:	(758, 2)
Full shape: (14748, 2)
50.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3396, 37)
After:	(747, 2)
Full shape: (15495, 2)
29.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3326, 38)
After:	(810, 2)
Full shape: (16305, 2)
34.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3471, 38)
After:	(778, 2)
Full shape: (17083, 2)
41.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3374, 37)
After:	(705, 2)
Full shape: (17788, 2)
44.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3291, 37)
After:	(710, 2)
Full shape: (18498, 2)
57.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2865, 37)
After:	(701, 2)
Full shape: (19199, 2)
37.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3042, 37)
After:	(751, 2)
Full shape: (19950, 2)
36.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3039, 37)
After:	(766, 2)
Full shape: (20716, 2)
32.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3162, 37)
After:	(791, 2)
Full shape: (21507, 2)
39.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3018, 37)
After:	(741, 2)
Full shape: (22248, 2)
38.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(2988, 37)
After:	(725, 2)
Full shape: (22973, 2)
08.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4495, 38)
After:	(921, 2)
Full shape: (23894, 2)
28.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4695, 38)
After:	(989, 2)
Full shape: (24883, 2)
26.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4621, 37)
After:	(926, 2)
Full shape: (25809, 2)
30.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4735, 38)
After:	(1053, 2)
Full shape: (26862, 2)
04.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4471, 37)
After:	(905, 2)
Full shape: (27767, 2)
06.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4413, 38)
After:	(921, 2)
Full shape: (28688, 2)
51.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4527, 37)
After:	(1038, 2)
Full shape: (29726, 2)
54.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4659, 37)
After:	(1051, 2)
Full shape: (30777, 2)
55.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4555, 38)
After:	(1055, 2)
Full shape: (31832, 2)
58.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4913, 37)
After:	(1013, 2)
Full shape: (32845, 2)
33.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4731, 37)
After:	(967, 2)
Full shape: (33812, 2)
09.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4451, 37)
After:	(986, 2)
Full shape: (34798, 2)
11.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4326, 37)
After:	(960, 2)
Full shape: (35758, 2)
21.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4490, 37)
After:	(931, 2)
Full shape: (36689, 2)
03.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4510, 37)
After:	(940, 2)
Full shape: (37629, 2)
48.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4475, 37)
After:	(962, 2)
Full shape: (38591, 2)
56.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4476, 37)
After:	(989, 2)
Full shape: (39580, 2)
05.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4605, 37)
After:	(913, 2)
Full shape: (40493, 2)
24.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4630, 37)
After:	(949, 2)
Full shape: (41442, 2)
43.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4669, 37)
After:	(946, 2)
Full shape: (42388, 2)
47.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4485, 37)
After:	(991, 2)
Full shape: (43379, 2)
27.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4298, 38)
After:	(964, 2)
Full shape: (44343, 2)
42.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4650, 37)
After:	(1033, 2)
Full shape: (45376, 2)
19.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4455, 37)
After:	(946, 2)
Full shape: (46322, 2)
25.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4573, 37)
After:	(973, 2)
Full shape: (47295, 2)
01.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4583, 37)
After:	(941, 2)
Full shape: (48236, 2)
02.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4588, 37)
After:	(886, 2)
Full shape: (49122, 2)
00.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4721, 38)
After:	(1025, 2)
Full shape: (50147, 2)
31.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4622, 37)
After:	(1072, 2)
Full shape: (51219, 2)
23.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4579, 37)
After:	(920, 2)
Full shape: (52139, 2)
52.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4554, 37)
After:	(1079, 2)
Full shape: (53218, 2)
13.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4522, 37)
After:	(981, 2)
Full shape: (54199, 2)
14.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4831, 37)
After:	(953, 2)
Full shape: (55152, 2)
35.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4791, 37)
After:	(1001, 2)
Full shape: (56153, 2)
20.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4649, 37)
After:	(962, 2)
Full shape: (57115, 2)
40.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4727, 37)
After:	(987, 2)
Full shape: (58102, 2)
46.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4531, 37)
After:	(1006, 2)
Full shape: (59108, 2)
49.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4770, 37)
After:	(996, 2)
Full shape: (60104, 2)
18.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4566, 37)
After:	(989, 2)
Full shape: (61093, 2)
59.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4627, 37)
After:	(960, 2)
Full shape: (62053, 2)
45.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4860, 37)
After:	(992, 2)
Full shape: (63045, 2)
22.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4511, 37)
After:	(963, 2)
Full shape: (64008, 2)
16.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4480, 38)
After:	(969, 2)
Full shape: (64977, 2)
53.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4632, 37)
After:	(1025, 2)
Full shape: (66002, 2)
15.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4546, 37)
After:	(920, 2)
Full shape: (66922, 2)
50.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4569, 37)
After:	(1036, 2)
Full shape: (67958, 2)
17.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4433, 37)
After:	(929, 2)
Full shape: (68887, 2)
29.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4627, 37)
After:	(936, 2)
Full shape: (69823, 2)
34.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4801, 37)
After:	(1007, 2)
Full shape: (70830, 2)
12.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4477, 38)
After:	(1000, 2)
Full shape: (71830, 2)
41.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4657, 37)
After:	(1008, 2)
Full shape: (72838, 2)
44.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4624, 37)
After:	(987, 2)
Full shape: (73825, 2)
57.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4649, 37)
After:	(1000, 2)
Full shape: (74825, 2)
37.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4628, 37)
After:	(959, 2)
Full shape: (75784, 2)
36.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4627, 37)
After:	(971, 2)
Full shape: (76755, 2)
32.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4730, 38)
After:	(953, 2)
Full shape: (77708, 2)
10.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4468, 37)
After:	(900, 2)
Full shape: (78608, 2)
39.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4715, 37)
After:	(1019, 2)
Full shape: (79627, 2)
38.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4688, 37)
After:	(987, 2)
Full shape: (80614, 2)
07.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4587, 37)
After:	(988, 2)
Full shape: (81602, 2)
08.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8508, 37)
After:	(1204, 2)
Full shape: (82806, 2)
28.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4000, 38)
After:	(1230, 2)
Full shape: (84036, 2)
26.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4270, 37)
After:	(1277, 2)
Full shape: (85313, 2)
30.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4104, 37)
After:	(1284, 2)
Full shape: (86597, 2)
04.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8291, 37)
After:	(1203, 2)
Full shape: (87800, 2)
06.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8635, 37)
After:	(1278, 2)
Full shape: (89078, 2)
51.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3885, 37)
After:	(1291, 2)
Full shape: (90369, 2)
54.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3726, 38)
After:	(1214, 2)
Full shape: (91583, 2)
55.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3684, 38)
After:	(1137, 2)
Full shape: (92720, 2)
58.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3519, 37)
After:	(1177, 2)
Full shape: (93897, 2)
33.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4283, 37)
After:	(1249, 2)
Full shape: (95146, 2)
09.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8528, 37)
After:	(1278, 2)
Full shape: (96424, 2)
11.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8441, 37)
After:	(1224, 2)
Full shape: (97648, 2)
21.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(7788, 37)
After:	(1262, 2)
Full shape: (98910, 2)
03.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8683, 37)
After:	(1313, 2)
Full shape: (100223, 2)
48.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3885, 38)
After:	(1186, 2)
Full shape: (101409, 2)
56.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3808, 37)
After:	(1157, 2)
Full shape: (102566, 2)
05.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8391, 37)
After:	(1262, 2)
Full shape: (103828, 2)
24.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(7630, 37)
After:	(1224, 2)
Full shape: (105052, 2)
43.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3816, 37)
After:	(1277, 2)
Full shape: (106329, 2)
47.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3661, 37)
After:	(1207, 2)
Full shape: (107536, 2)
27.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4020, 37)
After:	(1240, 2)
Full shape: (108776, 2)
42.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3883, 37)
After:	(1253, 2)
Full shape: (110029, 2)
19.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8012, 37)
After:	(1273, 2)
Full shape: (111302, 2)
25.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(5960, 37)
After:	(1241, 2)
Full shape: (112543, 2)
01.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(6573, 38)
After:	(1245, 2)
Full shape: (113788, 2)
02.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8571, 37)
After:	(1272, 2)
Full shape: (115060, 2)
00.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(4361, 37)
After:	(1449, 2)
Full shape: (116509, 2)
31.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3913, 37)
After:	(1286, 2)
Full shape: (117795, 2)
23.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(7623, 37)
After:	(1294, 2)
Full shape: (119089, 2)
52.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(3819, 38)
After:	(1330, 2)
Full shape: (120419, 2)
13.json.bz2


  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])
  df = pd.read_json(os.path.join(path,file_name), lines=True, convert_dates=["created_at","timestamp_ms"])


Before:	(8441, 37)
After:	(1242, 2)
Full shape: (121661, 2)
14.json.bz2


KeyboardInterrupt: 

In [5]:
df_big.to_csv("twitter_stream.csv")