In [1]:
import boto3
import json
import pandas as pd
import io

# Configurar o cliente S3
s3 = boto3.client('s3')

# Nome do bucket S3
bucket_name = 'call-alien-vault-tcc-dev-main-pulses'
directory_prefix = 'subscribed_wanna_cry/'  # O prefixo do diretório específico

# Listar objetos no diretório especifico do bucket
objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=directory_prefix)


# Inicializar um DataFrame pandas vazio para armazenar os dados
df_combined = pd.DataFrame()

# Iterar sobre os objetos
for obj in objects.get('Contents', []):
    key = obj['Key']
    
    # Verificar se o objeto é um arquivo JSON (opcional)
    if key.endswith('.json'):
        # Ler o arquivo JSON do S3
        response = s3.get_object(Bucket=bucket_name, Key=key)
        json_content = response['Body'].read().decode('utf-8')
        
        # Analisar o conteúdo JSON e criar um DataFrame
        try:
            data = json.loads(json_content)
            df = pd.DataFrame(data)
            df_combined = pd.concat([df_combined, df], ignore_index=True)
        except json.JSONDecodeError as e:
            print(f"Erro ao analisar o arquivo {key}: {str(e)}")
            
print(df_combined)

# Converter o DataFrame combinado em um arquivo JSON
combined_json_data = df_combined.to_json(orient='records')

# Salvar o arquivo JSON combinado
with open('combined_subscribed_wanna_cry.json', 'w', encoding='utf-8') as json_file:
    json_file.write(combined_json_data)



                                               previous  prefetch_pulse_ids  \
0     https://otx.alienvault.com/api/v1/pulses/subsc...               False   
1     https://otx.alienvault.com/api/v1/pulses/subsc...               False   
2     https://otx.alienvault.com/api/v1/pulses/subsc...               False   
3     https://otx.alienvault.com/api/v1/pulses/subsc...               False   
4     https://otx.alienvault.com/api/v1/pulses/subsc...               False   
...                                                 ...                 ...   
4990  https://otx.alienvault.com/api/v1/pulses/subsc...               False   
4991  https://otx.alienvault.com/api/v1/pulses/subsc...               False   
4992  https://otx.alienvault.com/api/v1/pulses/subsc...               False   
4993  https://otx.alienvault.com/api/v1/pulses/subsc...               False   
4994  https://otx.alienvault.com/api/v1/pulses/subsc...               False   

            t3   count  t                          

In [2]:
df_wanna_cry = pd.read_json('combined_subscribed_wanna_cry.json')
df_wanna_cry

Unnamed: 0,previous,prefetch_pulse_ids,t3,count,t,next,results,id,t2
0,https://otx.alienvault.com/api/v1/pulses/subsc...,False,1.729972,134456,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'CyberHunterAutoFeed', 'malwar...",e1a63493-a08d-4151-8f79-ce7da1afb335,0.171472
1,https://otx.alienvault.com/api/v1/pulses/subsc...,False,1.729972,134456,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'CyberHunterAutoFeed', 'malwar...",e1a63493-a08d-4151-8f79-ce7da1afb335,0.171472
2,https://otx.alienvault.com/api/v1/pulses/subsc...,False,1.729972,134456,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'CyberHunterAutoFeed', 'malwar...",e1a63493-a08d-4151-8f79-ce7da1afb335,0.171472
3,https://otx.alienvault.com/api/v1/pulses/subsc...,False,1.729972,134456,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'CyberHunterAutoFeed', 'malwar...",e1a63493-a08d-4151-8f79-ce7da1afb335,0.171472
4,https://otx.alienvault.com/api/v1/pulses/subsc...,False,1.729972,134456,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'CyberHunterAutoFeed', 'malwar...",e1a63493-a08d-4151-8f79-ce7da1afb335,0.171472
...,...,...,...,...,...,...,...,...,...
4990,https://otx.alienvault.com/api/v1/pulses/subsc...,False,2.223125,134389,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'jnazario', 'malware_families'...",dc5f7fae-2043-4fbb-ba2e-d985f38bae69,0.053657
4991,https://otx.alienvault.com/api/v1/pulses/subsc...,False,2.223125,134389,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'jnazario', 'malware_families'...",dc5f7fae-2043-4fbb-ba2e-d985f38bae69,0.053657
4992,https://otx.alienvault.com/api/v1/pulses/subsc...,False,2.223125,134389,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'jnazario', 'malware_families'...",dc5f7fae-2043-4fbb-ba2e-d985f38bae69,0.053657
4993,https://otx.alienvault.com/api/v1/pulses/subsc...,False,2.223125,134389,0,https://otx.alienvault.com/api/v1/pulses/subsc...,"{'author_name': 'jnazario', 'malware_families'...",dc5f7fae-2043-4fbb-ba2e-d985f38bae69,0.053657


In [3]:
results_wanna_cry = df_wanna_cry['results']

In [4]:
results_wanna_cry = df_wanna_cry['results'].reset_index(drop=True)

In [5]:
from pandas import json_normalize
df_json = json_normalize(df_wanna_cry['results'])

In [6]:
df_json

df_json['malware_families'] = df_json['malware_families'].apply(lambda x: ['WannaCry'] if not x else x)

In [7]:
df_json

Unnamed: 0,author_name,malware_families,more_indicators,references,targeted_countries,created,description,indicators,extract_source,adversary,revision,tags,public,industries,name,modified,tlp,attack_ids,id
0,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/TrackerC2Bot/status/16701...,[],2023-06-17T23:32:43.350000,,[],[],,1,[malware],1,[],Twitter Feed - TrackerC2Bot - 17-06-2023,2023-07-17T23:00:00.066000,green,[],648e429b3f9f53ead458aed6
1,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/balkanssec/status/1670095...,[],2023-06-17T23:32:40.404000,,[],[],,1,[malware],1,[],Twitter Feed - balkanssec - 17-06-2023,2023-07-17T23:00:00.066000,green,[],648e4298b1d2a6e9172a7faf
2,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/Jane_0sint/status/1670048...,[],2023-06-17T23:32:37.232000,,[],[],,1,[],1,[],Twitter Feed - Jane_0sint - 17-06-2023,2023-07-17T23:00:00.066000,green,[],648e4295a23d05aba14aa967
3,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/0xToxin/status/1670042835...,[],2023-06-17T23:32:36.465000,,"[{'indicator': 'promotores14.duckdns.org', 'is...",[],,1,[AsyncRAT],1,[],Twitter Feed - 0xToxin - 17-06-2023,2023-07-17T23:00:00.066000,green,[],648e429465b2a2af5a63a68b
4,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/malwrhunterteam/status/16...,[],2023-06-17T23:32:34.901000,,[{'indicator': 'qe6evcafs0.execute-api.us-east...,[],,1,[],1,[],Twitter Feed - malwrhunterteam - 17-06-2023,2023-07-17T23:00:00.066000,green,[],648e4292d5509e223192b47a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:49.086000,RDP honeypot authentication attempts from a US...,[],[],,1,"[RDP, honeypot]",1,[],RDP honeypot logs for 2023/07/20,2023-08-19T14:00:06.716000,green,[],64b943edbf9d2bcb6e0c812d
4991,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:49.734000,SSH honeypot logs for brute force attackers fr...,[{'indicator': 'f57fb0feafebe84525278fe2d083cd...,[],,1,"[SSH, bruteforce, honeypot]",1,[],SSH honeypot logs for 2023-07-20,2023-08-19T14:00:06.716000,green,[],64b943ed7423fbaac1254216
4992,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:48.540000,PostgresQL honeypot authentication attempts fr...,[],[],,1,"[postgres, honeypot]",1,[],PostgresQL honeypot logs for 2023-07-20,2023-08-19T14:00:06.716000,green,[],64b943ec819cc6263e85b892
4993,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:47.946000,VNC honeypot authentication attempts from a US...,[],[],,1,"[vnc, honeypot]",1,[],VNC honeypot logs for 2023/07/20,2023-08-19T14:00:06.716000,green,[],64b943ebb97fe2528cac3fe2


In [8]:
json_teste = df_json.to_json(orient='records')

# Salvar o arquivo JSON combinado
with open('teste_combined_subscribed_wanna_cry.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_teste)
    
    
teste = pd.read_json('teste_combined_subscribed_wanna_cry.json')

In [9]:
teste

Unnamed: 0,author_name,malware_families,more_indicators,references,targeted_countries,created,description,indicators,extract_source,adversary,revision,tags,public,industries,name,modified,tlp,attack_ids,id
0,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/TrackerC2Bot/status/16701...,[],2023-06-17T23:32:43.350000,,[],[],,1,[malware],1,[],Twitter Feed - TrackerC2Bot - 17-06-2023,2023-07-17 23:00:00.066,green,[],648e429b3f9f53ead458aed6
1,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/balkanssec/status/1670095...,[],2023-06-17T23:32:40.404000,,[],[],,1,[malware],1,[],Twitter Feed - balkanssec - 17-06-2023,2023-07-17 23:00:00.066,green,[],648e4298b1d2a6e9172a7faf
2,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/Jane_0sint/status/1670048...,[],2023-06-17T23:32:37.232000,,[],[],,1,[],1,[],Twitter Feed - Jane_0sint - 17-06-2023,2023-07-17 23:00:00.066,green,[],648e4295a23d05aba14aa967
3,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/0xToxin/status/1670042835...,[],2023-06-17T23:32:36.465000,,"[{'indicator': 'promotores14.duckdns.org', 'is...",[],,1,[AsyncRAT],1,[],Twitter Feed - 0xToxin - 17-06-2023,2023-07-17 23:00:00.066,green,[],648e429465b2a2af5a63a68b
4,CyberHunterAutoFeed,[WannaCry],False,[https://twitter.com/malwrhunterteam/status/16...,[],2023-06-17T23:32:34.901000,,[{'indicator': 'qe6evcafs0.execute-api.us-east...,[],,1,[],1,[],Twitter Feed - malwrhunterteam - 17-06-2023,2023-07-17 23:00:00.066,green,[],648e4292d5509e223192b47a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:49.086000,RDP honeypot authentication attempts from a US...,[],[],,1,"[RDP, honeypot]",1,[],RDP honeypot logs for 2023/07/20,2023-08-19 14:00:06.716,green,[],64b943edbf9d2bcb6e0c812d
4991,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:49.734000,SSH honeypot logs for brute force attackers fr...,[{'indicator': 'f57fb0feafebe84525278fe2d083cd...,[],,1,"[SSH, bruteforce, honeypot]",1,[],SSH honeypot logs for 2023-07-20,2023-08-19 14:00:06.716,green,[],64b943ed7423fbaac1254216
4992,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:48.540000,PostgresQL honeypot authentication attempts fr...,[],[],,1,"[postgres, honeypot]",1,[],PostgresQL honeypot logs for 2023-07-20,2023-08-19 14:00:06.716,green,[],64b943ec819cc6263e85b892
4993,jnazario,[WannaCry],False,[],[],2023-07-20T14:25:47.946000,VNC honeypot authentication attempts from a US...,[],[],,1,"[vnc, honeypot]",1,[],VNC honeypot logs for 2023/07/20,2023-08-19 14:00:06.716,green,[],64b943ebb97fe2528cac3fe2
