## BigData101 :
## From Data to Big Data



#### Import libraries

In [None]:
import pandas as pd
import requests
import csv
from pymongo import MongoClient
import matplotlib.pyplot as plt

#### Extract data

In [None]:
# Endpoint queried for the asset list.
# NOTE(review): this is plain http; consider https if the endpoint supports it - confirm.
url = "http://api.coincap.io/v2/assets"

In [None]:
# Headers sent along with the API request.
header = {
    "Content-type": "application/json",
    "Accept-Encoding": "deflate",
}

In [None]:
# Call the API. Without a timeout, requests waits indefinitely when the server
# never answers, which would leave this cell hanging.
response = requests.get(url, headers=header, timeout=30)
print(response)

# Stop here on a 4xx/5xx answer instead of trying to parse an error page as JSON.
response.raise_for_status()

# Decoded JSON body of the response.
responseData = response.json()

In [None]:
# Flatten the records stored under the 'data' key of the response into a table.
df = pd.json_normalize(responseData, record_path='data')
print(df)

In [None]:
# Save the API data next to the notebook as a CSV file.
df.to_csv(path_or_buf='./APIdata.csv')

#### MongoDB Connection

In [None]:
# Client for the MongoDB server addressed by this local URI (port 27017).
client = MongoClient(host="mongodb://localhost:27017/")

In [None]:
# Handle on the 'itcDB' database. MongoDB creates a database lazily, so it
# only shows up on the server once data has been written to it.
db = client["itcDB"]

In [None]:
# Name of the collection that will receive the CSV rows
collection_name = "data"

In [None]:
# Handle on the collection named by collection_name inside db
collection = db.get_collection(collection_name)

In [None]:
# Show which databases the server currently reports.
database_names = client.list_database_names()
print(database_names)

#### Loading

In [None]:
# Location of the CSV file to load into MongoDB.
# NOTE(review): absolute path on one machine; adjust before running elsewhere.
csv_file_path = "/home/tarek/Documents/codingStuffs/BigData101/DataITC.csv"

In [None]:
# Load the CSV file into the MongoDB collection, one document per CSV row
# (document keys are the CSV header names).
# NOTE(review): running this cell again inserts the same rows a second time.
BATCH_SIZE = 1000  # number of rows sent to MongoDB per insert_many call

# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open(csv_file_path, 'r', newline='') as file:
    # Create a CSV reader
    csv_reader = csv.DictReader(file)

    # Collect rows and send them in batches instead of one round-trip per row
    batch = []
    for row in csv_reader:
        batch.append(row)
        if len(batch) >= BATCH_SIZE:
            collection.insert_many(batch)
            batch = []

    # insert_many rejects an empty list, so only flush a non-empty remainder
    if batch:
        collection.insert_many(batch)

In [None]:
# Report where the rows came from and where they were stored.
print(
    f'Data from {csv_file_path} has been inserted into '
    f'the {collection_name} collection in the itcDB database.'
)

In [None]:
# List the databases again now that the rows have been inserted.
database_names = client.list_database_names()
print(database_names)

In [None]:
# Fetch a single document (empty filter matches everything) as a sanity check.
first_document = collection.find_one({})
print(first_document)

In [None]:
# Read every document of the collection back into a DataFrame.
# This rebinds df, replacing the API DataFrame built earlier.
documents = list(collection.find())
df = pd.DataFrame(documents)

In [None]:
# Preview the first five rows
df.head(n=5)

#### Transformation

In [None]:
# Display the column labels of the DataFrame before any transformation.
df.columns

In [None]:
# Columns that are removed from the DataFrame
columns_to_delete = [
    'premiere tentative',
    'date premiere tentative',
    'deuxieme tentative',
    'date deuxieme tentative',
    'troisieme tentative',
    'date troisieme tentative',
]

# errors='ignore' skips any listed column that is not present
df = df.drop(columns_to_delete, axis=1, errors='ignore')

In [None]:
# Display the column labels again, after the drop above.
df.columns

In [None]:
# Build 'full_name' as "<prenom> <nom>"
df['full_name'] = df['prenom'] + (' ' + df['nom'])

# The two source columns are no longer needed once they are merged
df = df.drop(['prenom', 'nom'], axis=1, errors='ignore')

In [None]:
# Drop rows where 'telephone' column is empty.
# The rows were loaded through csv.DictReader, which yields '' (not NaN) for a
# blank cell, so dropna() alone would keep them. Blank or whitespace-only
# values are therefore treated as missing as well.
telephone = df['telephone']
has_telephone = telephone.notna() & telephone.astype(str).str.strip().ne('')

# .copy() so later column assignments act on an independent DataFrame
df = df[has_telephone].copy()

In [None]:
# Keep 'H' and 'F' as they are; every other value becomes 'Undefined'.
known_gender = df['Gender'].isin(['H', 'F'])
df['Gender'] = df['Gender'].where(known_gender, 'Undefined')

In [None]:
# Aggregation pipeline: keep documents whose "dernier statut" is "Livre",
# then count them per value of the "type" field.
match_stage = {"$match": {"dernier statut": "Livre"}}
group_stage = {"$group": {"_id": "$type", "nbrventes": {"$sum": 1}}}
pipeline = [match_stage, group_stage]

In [None]:
# Run the pipeline on the collection (not on df) and materialise the output.
result = [*collection.aggregate(pipeline)]
print(result)

In [None]:
# Split the aggregation output into bar labels and bar heights
type_labels = []
nbrventes_values = []
for group in result:
    type_labels.append(group['_id'])
    nbrventes_values.append(group['nbrventes'])

# Bar chart with one bar per type
plt.bar(type_labels, nbrventes_values, color='green')
plt.title('Number of Sales by Type')
plt.xlabel('Type')
plt.ylabel('Number of Sales')
plt.show()