In [1]:
import pandas as pd
import glob

## Extract

In [2]:
#Manually extracts similar .csv files with no errors. If the .csv files have different columns it will error out.
df = pd.concat(map(pd.read_csv, glob.glob('test_input' + '\*.csv')))
#Extract All source code
#https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [3]:
df.head()

Unnamed: 0,City,City_ID,Cloudiness,Country,DEATHS,Date,Humidity,Lat,Lng,Max Temp,...,rentals_handgun,rentals_long_gun,return_to_seller_handgun,return_to_seller_long_gun,return_to_seller_other,returned_handgun,returned_long_gun,returned_other,state,totals
0,jacareacanga,0.0,0.0,BR,,1528902000.0,62.0,-6.22,-57.76,89.6,...,,,,,,,,,,
1,kaitangata,1.0,100.0,NZ,,1528905000.0,94.0,-46.28,169.85,42.61,...,,,,,,,,,,
2,goulburn,2.0,20.0,AU,,1528905000.0,91.0,-34.75,149.72,44.32,...,,,,,,,,,,
3,lata,3.0,76.0,IN,,1528905000.0,89.0,30.78,78.62,59.89,...,,,,,,,,,,
4,chokurdakh,4.0,0.0,RU,,1528905000.0,88.0,70.62,147.9,32.17,...,,,,,,,,,,


In [4]:
len(df)

15437

## Transform

In [5]:
df['hash'] = pd.Series((hash(tuple(row)) for _, row in df.iterrows()))
#Hash Code
#https://stackoverflow.com/questions/25757042/create-hash-value-for-each-row-of-data-with-selected-columns-in-dataframe-in-pyt
df.head()

Unnamed: 0,City,City_ID,Cloudiness,Country,DEATHS,Date,Humidity,Lat,Lng,Max Temp,...,rentals_long_gun,return_to_seller_handgun,return_to_seller_long_gun,return_to_seller_other,returned_handgun,returned_long_gun,returned_other,state,totals,hash
0,jacareacanga,0.0,0.0,BR,,1528902000.0,62.0,-6.22,-57.76,89.6,...,,,,,,,,,,-4621756264185551560
1,kaitangata,1.0,100.0,NZ,,1528905000.0,94.0,-46.28,169.85,42.61,...,,,,,,,,,,4307043627102770079
2,goulburn,2.0,20.0,AU,,1528905000.0,91.0,-34.75,149.72,44.32,...,,,,,,,,,,-1231273703198813820
3,lata,3.0,76.0,IN,,1528905000.0,89.0,30.78,78.62,59.89,...,,,,,,,,,,7578054796355573265
4,chokurdakh,4.0,0.0,RU,,1528905000.0,88.0,70.62,147.9,32.17,...,,,,,,,,,,5482800537892913498


In [6]:
len(df)

15437

In [7]:
df_nodupe = df[~df.duplicated()]
#Dedupe code source
#https://stackoverflow.com/questions/40438237/assign-hash-to-row-of-categorical-data-in-pandas

In [8]:
df_nodupe.head()

Unnamed: 0,City,City_ID,Cloudiness,Country,DEATHS,Date,Humidity,Lat,Lng,Max Temp,...,rentals_long_gun,return_to_seller_handgun,return_to_seller_long_gun,return_to_seller_other,returned_handgun,returned_long_gun,returned_other,state,totals,hash
0,jacareacanga,0.0,0.0,BR,,1528902000.0,62.0,-6.22,-57.76,89.6,...,,,,,,,,,,-4621756264185551560
1,kaitangata,1.0,100.0,NZ,,1528905000.0,94.0,-46.28,169.85,42.61,...,,,,,,,,,,4307043627102770079
2,goulburn,2.0,20.0,AU,,1528905000.0,91.0,-34.75,149.72,44.32,...,,,,,,,,,,-1231273703198813820
3,lata,3.0,76.0,IN,,1528905000.0,89.0,30.78,78.62,59.89,...,,,,,,,,,,7578054796355573265
4,chokurdakh,4.0,0.0,RU,,1528905000.0,88.0,70.62,147.9,32.17,...,,,,,,,,,,5482800537892913498


In [9]:
len(df_nodupe)

14589

## Load

#### > Convert DataFrame to json formatted file

In [10]:
import json

In [11]:
df_nodupe.to_json('ETL_json_file', orient='index')
# Converting dataframe data to json format
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html

ValueError: DataFrame index must be unique for orient='index'.

In [None]:
json_data = df_nodupe.to_json(orient='index')
parsed = json.loads(json_data)
# print(json.dumps(parsed, indent=4, sort_keys=True))
# Un-comment print to pretty print json formated data

#### > Create MongoDB named 'etl_db' with a collection 'etl_data'

In [None]:
import pymongo
from pymongo import MongoClient
client = MongoClient()
client = MongoClient('localhost', 27017)
client = MongoClient('mongodb://localhost:27017/')
print('Mongo version', pymongo.__version__)

In [None]:
# Uncomment code to drop existing database before creating new one
# client.drop_database('etl_db')

In [None]:
db = client['etl_db']
# Create database named 'etl_db'

db.etl_data.drop() # Drop collection if it already exist

collection = db['etl_data']
# Create collection called 'etl_data'

#### > Load ETL_json_file data

In [None]:
with open('ETL_json_file') as f:
    file_data = json.load(f)
    collection.insert_one(file_data) 
client.close()
# Read json file into mongoDB
# https://stackoverflow.com/questions/49510049/how-to-import-json-file-to-mongodb-using-python

In [None]:
db.list_collection_names()
# List collections in database

In [None]:
db.etl_data.find( { 0: {} } )


In [None]:
# cursor = db.etl_data # selecting the etl_data collection
# for document in cursor.find():
#    print (document)
# Printing the content of the collection LARGE PRINTOUT

#### > Reading content of database and loading it into a dataframe

In [None]:
extracted_etl_data = db.etl_data
df = pd.DataFrame(list(extracted_etl_data.find()))
# https://stackoverflow.com/questions/16249736/how-to-import-data-from-mongodb-to-pandas

In [None]:
df

In [None]:
df = df.melt()

In [None]:
df.head()

In [None]:
len(df)

In [None]:
df.iloc[0][1]
# Showing content of first row 'value'

#### > CSV Move script - moves .csv files after being ETL processed

In [None]:
import shutil, os, glob

In [None]:
#For Loop moving all .csv files from the test_input folder 
for filePath in glob.glob('test_input' + '\*.csv'):
            #Move each file to the destination directory
        shutil.move(filePath, 'test_processed');
# Move all files that have been ETL'ed into a processed folder
#Source Code           
#https://thispointer.com/python-how-to-move-files-and-directories/