## Install sqlite
Download and install SQLite from https://www.sqlite.org/download.html

Or, if you are using a conda environment, run: `conda install -c anaconda sqlite`

## Note:
### db naming convention:
#### project_s + _summary + _v(version) + _timestamp (YYYYMMDD)
    

e.g.
'project_s_sample_v0.0_20210221'

# Step 1: Data preprocessing

In [1]:
import pandas as pd
import sqlite3
from pathlib import Path

## twitter

In [2]:
# Load ../data/twitter_kw_netflix.csv into a DataFrame and preview its first two rows.
# NOTE(review): the recorded output of this cell ends with a truncated warning --
# presumably a pandas DtypeWarning about mixed column types; confirm and consider
# passing an explicit `dtype=` here.
twitter_netflix_df = pd.read_csv(filepath_or_buffer="../data/twitter_kw_netflix.csv")
twitter_netflix_df.head(n=2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
0,Is anyone else watching #ginnyandgeorgia on Ne...,Sat Feb 27 23:59:59 +0000 2021,64722346\t,1365813966090481665\t,OHIO,135,3642,False,0,0,,False,ginnyandgeorgia,
1,@tofumisox @netflix For serious scaries: The C...,Sat Feb 27 23:59:58 +0000 2021,91931793\t,1365813962114129921\t,"San Francisco, CA",2593,6966,False,1,0,,False,,


In [3]:
# show the column labels of the tweet frame
twitter_netflix_df.columns

Index(['full text', 'time created', 'user_id', 'tweet_id', 'location',
       'followers_count', 'statuses_count', 'verified', 'description',
       'reply_count', 'retweet_count', 'favorites_count', 'retweeted',
       'hashtags'],
      dtype='object')

In [4]:
# preprocess df
# Normalise the two space-separated column names, add a shortened copy of the
# tweet text, and fix the column order used for the rest of the notebook.
final_columns = ['user_id',
                 'tweet_id',
                 'time_created',
                 'reduced_text',
                 'full_text',
                 'location',
                 'followers_count',
                 'statuses_count',
                 'verified',
                 'description',
                 'reply_count',
                 'retweet_count',
                 'favorites_count',
                 'retweeted',
                 'hashtags']
twitter_netflix_df.rename(columns={'full text': 'full_text',
                                   'time created': 'time_created'}, inplace=True)

# `.str[:255]` keeps the first 255 characters exactly like the previous
# `lambda x: x[:255]`, but propagates a missing (NaN) full_text instead of
# raising TypeError on it.
twitter_netflix_df['reduced_text'] = twitter_netflix_df['full_text'].str[:255]

# NOTE(review): the recorded output of the later `retweet_count` check shows
# rows (index 92, 2918, 6663, ...) whose values are the CSV header labels, and
# the ids carry a trailing "\t". Neither is cleaned here -- confirm whether
# those rows should be dropped before the frame is written to SQLite.
twitter_netflix_df = twitter_netflix_df[final_columns]
twitter_netflix_df.head(2)

Unnamed: 0,user_id,tweet_id,time_created,reduced_text,full_text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
0,64722346\t,1365813966090481665\t,Sat Feb 27 23:59:59 +0000 2021,Is anyone else watching #ginnyandgeorgia on Ne...,Is anyone else watching #ginnyandgeorgia on Ne...,OHIO,135,3642,False,0,0,,False,ginnyandgeorgia,
1,91931793\t,1365813962114129921\t,Sat Feb 27 23:59:58 +0000 2021,@tofumisox @netflix For serious scaries: The C...,@tofumisox @netflix For serious scaries: The C...,"San Francisco, CA",2593,6966,False,1,0,,False,,


In [6]:
## check retweet values
# Re-read the raw CSV into a scratch frame and display only the rows whose
# retweet_count is populated.
t = pd.read_csv(filepath_or_buffer="../data/twitter_kw_netflix.csv")

t[t["retweet_count"].notnull()]

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
92,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
2918,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
6663,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
11341,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
17870,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
27158,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
39133,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
53868,full text,time created,user_id,tweet_id,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags


## kaggle

In [7]:
# Load ../data/kaggle_movies.csv into a DataFrame and preview its first two rows.
kaggle_movies_df = pd.read_csv(filepath_or_buffer="../data/kaggle_movies.csv")
kaggle_movies_df.head(n=2)

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0


In [8]:
# preprocess df
# Columns kept for the movies table, in the order they are stored.
km_columns = [
    "id",
    "title",
    "year",
    "age",
    "imdb",
    "rotten_tomatoes",
    "netflix",
    "hulu",
    "prime_video",
    "disney",
    "type",
    "directors",
    "genres",
    "country",
    "language",
    "runtime",
]

# Normalise the headers: lower-case, trim leading/trailing "+", and turn
# every space into an underscore (e.g. "Prime Video" -> "prime_video").
kaggle_movies_df.columns = [
    label.lower().strip("+").replace(" ", "_")
    for label in kaggle_movies_df.columns
]

# Keep only the selected columns; this drops the "unnamed" index columns.
kaggle_movies_df = kaggle_movies_df[km_columns]

def convert_tomato_score(x):
    """Convert a Rotten Tomatoes score such as ``"87%"`` to a float.

    Args:
        x: A percentage string (e.g. ``"87%"``), or a non-string value such
            as the float NaN that marks a missing score.

    Returns:
        The numeric score as a float for string input. Non-string input is
        returned unchanged, so missing values pass straight through.

    Raises:
        ValueError: If a string does not contain a number once the percent
            sign is removed.
    """
    if isinstance(x, str):
        # tolerate surrounding whitespace before dropping the percent sign
        return float(x.strip().strip("%"))
    return x
    
def convert_age(x):
    """Convert an age-rating label such as ``"13+"`` to its minimum age.

    Args:
        x: A rating string (``"13+"``, ``"18+"``, ``"all"``), or a non-string
            value such as the float NaN that marks a missing rating.

    Returns:
        ``0`` for the label ``"all"`` (matched case-insensitively), the
        integer minimum age for labels like ``"13+"``, and non-string input
        unchanged so missing values pass straight through.

    Raises:
        ValueError: If a string is neither ``"all"`` nor an integer with an
            optional ``"+"``.
    """
    if not isinstance(x, str):
        return x
    label = x.strip()
    if label.lower() == 'all':
        return 0
    return int(label.strip("+"))


# Turn the percentage strings and age labels into numeric values by passing
# the converter functions to apply() directly.
kaggle_movies_df['rotten_tomatoes'] = kaggle_movies_df['rotten_tomatoes'].apply(convert_tomato_score)
kaggle_movies_df['age'] = kaggle_movies_df['age'].apply(convert_age)

kaggle_movies_df.head(n=2)

Unnamed: 0,id,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney,type,directors,genres,country,language,runtime
0,1,Inception,2010,13.0,8.8,87.0,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18.0,8.7,87.0,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0


In [9]:
# Load ../data/kaggle_tvshows.csv into a DataFrame and preview its first two rows.
kaggle_tvshows_df = pd.read_csv(filepath_or_buffer="../data/kaggle_tvshows.csv")
kaggle_tvshows_df.head(n=2)

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1


In [10]:
# preprocess df
# Columns kept for the TV-show table, in the order they are stored.
ktv_columns = [
    "title",
    "year",
    "age",
    "imdb",
    "rotten_tomatoes",
    "netflix",
    "hulu",
    "prime_video",
    "disney",
    "type",
]

# Normalise the headers: lower-case, trim leading/trailing "+", and turn
# every space into an underscore (e.g. "Rotten Tomatoes" -> "rotten_tomatoes").
kaggle_tvshows_df.columns = [
    label.lower().strip("+").replace(" ", "_")
    for label in kaggle_tvshows_df.columns
]

# Keep only the selected columns; this drops the "unnamed" index columns.
kaggle_tvshows_df = kaggle_tvshows_df[ktv_columns]

# apply convert function
kaggle_tvshows_df['rotten_tomatoes'] = kaggle_tvshows_df['rotten_tomatoes'].apply(convert_tomato_score)
kaggle_tvshows_df['age'] = kaggle_tvshows_df['age'].apply(convert_age)

kaggle_tvshows_df.head(n=2)

Unnamed: 0,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney,type
0,Breaking Bad,2008,18.0,9.5,96.0,1,0,0,0,1
1,Stranger Things,2016,16.0,8.8,93.0,1,0,0,0,1


# Step 2: Create the database and tables

In [11]:
## TODO: rename this dev database following the naming convention described above
db_name = "project_s_prototype_v0.1_20210228.db"
table_name_tweet = "twitter_netflix"
# make sure the database file exists (an existing file is left in place)
Path(db_name).touch(exist_ok=True)

In [12]:
## TODO: rename this dev database following the naming convention described above
db_name = "project_s_prototype_v0.1_20210228.db"
table_name_km = "kaggle_movies"
# make sure the database file exists (an existing file is left in place)
Path(db_name).touch(exist_ok=True)

In [13]:
## TODO: rename this dev database following the naming convention described above
db_name = "project_s_prototype_v0.1_20210228.db"
table_name_ktv = "kaggle_tvshows"
# make sure the database file exists (an existing file is left in place)
Path(db_name).touch(exist_ok=True)

### Initialize the SQLite tables

In [14]:
# connect to the SQLite database file named by db_name; the cursor `c` is used
# by the CREATE TABLE cells below
# NOTE(review): no conn.close() is visible in this notebook -- confirm the
# connection is closed once the work is done
conn = sqlite3.connect(db_name)
c = conn.cursor()

In [15]:
# inspect the pandas dtypes before declaring the SQL column types
twitter_netflix_df.dtypes

user_id            object
tweet_id           object
time_created       object
reduced_text       object
full_text          object
location           object
followers_count    object
statuses_count     object
verified           object
description        object
reply_count        object
retweet_count      object
favorites_count    object
retweeted          object
hashtags           object
dtype: object

In [16]:
# initialize schema
# Create the tweet table unless it already exists; the table name is taken
# from table_name_tweet.
c.execute(f'''CREATE TABLE IF NOT EXISTS {table_name_tweet}
                         (user_id int, 
                          tweet_id int,
                          time_created timestamp,
                          reduced_text varchar,
                          full_text text,
                          location varchar(32),
                          followers_count int,
                          statuses_count int,
                          verified BOOLEAN DEFAULT(FALSE),
                          description varchar(32),
                          reply_count int,
                          retweet_count int,
                          favorites_count int,
                          retweeted varchar(32),
                          hashtags varchar(32)
                          )''')

<sqlite3.Cursor at 0x7f481a9621f0>

In [17]:
# inspect the pandas dtypes before declaring the SQL column types
kaggle_movies_df.dtypes

id                   int64
title               object
year                 int64
age                float64
imdb               float64
rotten_tomatoes    float64
netflix              int64
hulu                 int64
prime_video          int64
disney               int64
type                 int64
directors           object
genres              object
country             object
language            object
runtime            float64
dtype: object

In [18]:
# initialize schema
# Create the movies table unless it already exists; the table name is taken
# from table_name_km.
c.execute(f'''CREATE TABLE IF NOT EXISTS {table_name_km}
                         (id int, 
                          title varchar,
                          year int,
                          age int,
                          imdb float,
                          rotten_tomatoes float,
                          netflix int,
                          hulu int,
                          prime_video int,
                          disney int,
                          type int,
                          directors varchar,
                          genres varchar,
                          country varchar,
                          language varchar,
                          runtime float
                          )''')

<sqlite3.Cursor at 0x7f481a9621f0>

In [19]:
# initialize schema
# Create the TV-show table unless it already exists; the table name is taken
# from table_name_ktv.
c.execute(f'''CREATE TABLE IF NOT EXISTS {table_name_ktv}
                         (title varchar,
                          year int,
                          age int,
                          imdb float,
                          rotten_tomatoes float,
                          netflix int,
                          hulu int,
                          prime_video int,
                          disney int,
                          type int
                          )''')

<sqlite3.Cursor at 0x7f481a9621f0>

### Write to the SQL tables

In [20]:
# convert csv to table
# Append the preprocessed tweets to the SQLite table. The name comes from
# table_name_tweet so the insert targets the same table as the CREATE TABLE
# and SELECT cells, instead of repeating the literal.
# NOTE(review): if_exists='append' inserts the rows again every time this
# cell is re-run.
twitter_netflix_df.to_sql(table_name_tweet,
                          conn,
                          if_exists='append',
                          index=False)

In [21]:
# convert csv to table
# Append the preprocessed movies to the SQLite table. The name comes from
# table_name_km so the insert targets the same table as the CREATE TABLE
# and SELECT cells, instead of repeating the literal.
# NOTE(review): if_exists='append' inserts the rows again every time this
# cell is re-run.
kaggle_movies_df.to_sql(table_name_km,
                        conn,
                        if_exists='append',
                        index=False)

In [22]:
# convert csv to table
# Append the preprocessed TV shows to the SQLite table. The name comes from
# table_name_ktv so the insert targets the same table as the CREATE TABLE
# and SELECT cells, instead of repeating the literal.
# NOTE(review): if_exists='append' inserts the rows again every time this
# cell is re-run.
kaggle_tvshows_df.to_sql(table_name_ktv,
                         conn,
                         if_exists='append',
                         index=False)

# Query data from the SQLite tables

In [23]:
# Read the whole tweet table back into a DataFrame and preview two rows.
q = """
    SELECT * FROM {table_name}
    """
tweet_df = pd.read_sql(
    sql=q.format(table_name=table_name_tweet),
    con=conn,
)
tweet_df.head(n=2)

Unnamed: 0,user_id,tweet_id,time_created,reduced_text,full_text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
0,64722346,1365813966090481665,Sat Feb 27 23:59:59 +0000 2021,Is anyone else watching #ginnyandgeorgia on Ne...,Is anyone else watching #ginnyandgeorgia on Ne...,OHIO,135,3642,False,0,0,,False,ginnyandgeorgia,
1,91931793,1365813962114129921,Sat Feb 27 23:59:58 +0000 2021,@tofumisox @netflix For serious scaries: The C...,@tofumisox @netflix For serious scaries: The C...,"San Francisco, CA",2593,6966,False,1,0,,False,,


In [24]:
# show the rows of the preprocessed frame whose retweet_count is populated
twitter_netflix_df[twitter_netflix_df['retweet_count'].notnull()]

Unnamed: 0,user_id,tweet_id,time_created,reduced_text,full_text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
92,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
2918,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
6663,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
11341,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
17870,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
27158,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
39133,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags
53868,user_id,tweet_id,time created,full text,full text,location,followers_count,statuses_count,verified,description,reply_count,retweet_count,favorites_count,retweeted,hashtags


In [25]:
# Read the whole movies table back into a DataFrame and preview two rows.
q = """
    SELECT * FROM {table_name}
    """
km_df = pd.read_sql(
    sql=q.format(table_name=table_name_km),
    con=conn,
)
km_df.head(n=2)

Unnamed: 0,id,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney,type,directors,genres,country,language,runtime
0,1,Inception,2010,13.0,8.8,87.0,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18.0,8.7,87.0,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0


In [26]:
# Read the whole TV-show table back into a DataFrame and preview two rows.
q = """
    SELECT * FROM {table_name}
    """
ktv_df = pd.read_sql(
    sql=q.format(table_name=table_name_ktv),
    con=conn,
)
ktv_df.head(n=2)

Unnamed: 0,title,year,age,imdb,rotten_tomatoes,netflix,hulu,prime_video,disney,type
0,Breaking Bad,2008,18.0,9.5,96.0,1,0,0,0,1
1,Stranger Things,2016,16.0,8.8,93.0,1,0,0,0,1


### Delete table

In [27]:
## 1. start the sqlite3 shell in a terminal
## 2. open the correct database file (e.g. `.open <db_name>`)
## 3. run in the sqlite3 shell: drop table <table_name>;