# Load_from_Silver_to_Gold
##### En la capa Gold, los datos suelen estar optimizados para análisis específicos, modelos de negocio o dashboards, presentándose en una forma altamente agregada, segmentada o modelada según los requisitos del usuario final. Este paso suele incluir cálculos avanzados, agregaciones y enriquecimiento de información.







.


In [1]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from pyspark.sql.types import StringType
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
import re
from awsglue.job import Job
from pyspark.context import SparkContext
import boto3

# Obtener los argumentos del trabajo
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Crear contexto de Glue
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session


# Crear el objeto Job
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: 973d96b3-247e-4b9e-a5d2-e4477f81d39b
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session 973d96b3-247e-4b9e-a5d2-e4477f81d39b to get into ready status...
Session 973d96b3-247e-4b9e-a5d2-e4477f81d39b has been created.
ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=GlueReplApp, master=jes) created by __init__ at /tmp/32870217093724463:514 


In [2]:
# Ruta del archivo Parquet en S3

parquet_path = "s3://spaceflightbuckes2025/Silver /cleaned_data/part-*.parquet"
# Lectura del archivo Parquet con esquema inferido
data_cleaned = spark.read.parquet(parquet_path)




In [4]:
# -------------------------------
# Paso 1: Análisis de Contenido
# -------------------------------

# Lista de stopwords básicas (puedes ampliar esta lista si lo deseas)
stop_words = set([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", 
    "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", 
    "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "can't", 
    "did", "didn't", "do", "does", "doesn't", "don't", "doing", "don't", "down", 
    "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", 
    "haven't", "having", "he", "he's", "her", "here", "here's", "hers", "herself", "he's", 
    "how", "how's", "however", "i", "i'm", "i've", "i'll", "i'd", "i'll", "i'm", "if", "is", 
    "isn't", "it's", "let", "me", "more", "my", "myself", "no", "nor", "not", "of", "on", 
    "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", 
    "own", "same", "she", "she's", "should", "shouldn't", "so", "some", "such", "than", 
    "that", "that's", "that'll", "that'd", "that's", "the", "that's", "theirs", "them", 
    "they", "they're", "they've", "they'd", "this", "this's", "those", "through", "to", 
    "too", "under", "until", "up", "very", "was", "wasn't", "we", "we're", "we've", 
    "we'll", "we'd", "we're", "what", "what's", "what's", "when", "when's", "when", 
    "where", "where's", "where", "while", "who", "who's", "whom", "who's", "why", "why's"
])

# Función para extraer palabras clave
def extract_keywords(text):
    if not text:
        return None
    text = re.sub(r"[^\w\s]", "", text.lower())  # Limpiar texto
    tokens = text.split()  # Tokenización básica
    keywords = [word for word in tokens if word not in stop_words]  # Filtrar stopwords
    return ",".join(keywords)

# UDF para Spark
extract_keywords_udf = udf(extract_keywords, StringType())

# Aplicar UDF para palabras clave
cleaned_data = data_cleaned.withColumn("keywords", extract_keywords_udf(col("summary")))




In [5]:
# -------------------------------
# Paso 2: Análisis de Sentimiento usando AWS Comprehend
# -------------------------------

import boto3
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# Función para obtener análisis de sentimiento y agregar la puntuación numérica
def detect_sentiment(text):
    if text:
        # Crear el cliente de AWS Comprehend dentro de la función
        comprehend_client = boto3.client('comprehend', region_name='us-east-1')  # Ajusta la región si es necesario
        
        response = comprehend_client.batch_detect_sentiment(
            TextList=[text],
            LanguageCode='en'  # O 'es' si los textos están en español
        )
        
        sentiment = response['ResultList'][0]['Sentiment']
        
        # Asignar puntuación numérica basada en el sentimiento
        if sentiment == "POSITIVE":
            score = 1
        elif sentiment == "NEGATIVE":
            score = -1
        elif sentiment == "NEUTRAL":
            score = 0
        else:
            score = 2  # Para el sentimiento MIXED, asignamos 2 (puedes cambiarlo si lo prefieres)
        
        return score
    
    return None

# Registrar UDF para análisis de sentimiento
detect_sentiment_udf = udf(detect_sentiment, IntegerType())

# Aplicar análisis de sentimiento y agregar la columna con puntuación numérica
cleaned_data = cleaned_data.withColumn("sentiment_score", detect_sentiment_udf(col("summary")))
          







In [7]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType
# -------------------------------
# Paso 2:Modelo de clasificacion
# -------------------------------

classification_dict = {
    # Términos astronómicos
    "Space": "Astronomical terminology",
    "Galaxy": "Astronomical terminology",
    "Universe": "Astronomical terminology",
    "Orbit": "Astronomical terminology",
    "Star": "Astronomical terminology",
    "Constellation": "Astronomical terminology",
    "Planet": "Astronomical terminology",
    "Satellite": "Astronomical terminology, Technology and science",
    "Asteroid": "Astronomical terminology",
    "Comet": "Astronomical terminology",
    "Black hole": "Astronomical terminology",
    "Nebula": "Astronomical terminology",
    "Meteorite": "Astronomical terminology",
    "Gravity": "Astronomical terminology, Physics",
    "Microgravity": "Astronomical terminology, Physics",
    "Exoplanet": "Astronomical terminology",
    
    # Términos relacionados con vehículos espaciales
    "Rocket": "Spacecraft and vehicles",
    "Shuttle": "Spacecraft and vehicles",
    "Spaceship": "Spacecraft and vehicles",
    "Probe": "Spacecraft and vehicles",
    "Space station": "Spacecraft and vehicles",
    "Lunar module": "Spacecraft and vehicles",
    "Rover": "Spacecraft and vehicles",
    "Thruster": "Spacecraft and vehicles",
    "Vehicle": "Spacecraft and vehicles",
    "Capsule": "Spacecraft and vehicles",
    "Falcon": "Spacecraft and vehicles",
    "Artemis": "Spacecraft and vehicles",
    "Orion": "Spacecraft and vehicles",
    
    # Términos de misiones y exploración
    "Exploration": "Missions and exploration",
    "Colonization": "Missions and exploration",
    "Mission": "Missions and exploration",
    "Liftoff": "Missions and exploration",
    "Landing": "Missions and exploration",
    "Launch": "Missions and exploration",
    "Docking": "Missions and exploration",
    "Mars": "Missions and exploration",
    "Moon": "Missions and exploration",
    "Journey": "Missions and exploration",
    "Discovery": "Missions and exploration",
    "Research": "Missions and exploration",
    
    # Términos relacionados con tecnología y ciencia
    "Propulsion": "Technology and science",
    "Fuel": "Technology and science",
    "Solar panel": "Technology and science",
    "Artificial intelligence": "Technology and science",
    "Robotics": "Technology and science",
    "Innovation": "Technology and science",
    "Engineering": "Technology and science",
    "Physics": "Technology and science",
    "Astronomy": "Technology and science",
    "Science": "Technology and science",
    "Instruments": "Technology and science",
    "Telescope": "Technology and science",
    "Sensors": "Technology and science",
    "Simulation": "Technology and science",
    
    # Organizaciones y programas espaciales
    "NASA": "Organizations and space programs",
    "ESA (European Space Agency)": "Organizations and space programs",
    "SpaceX": "Organizations and space programs",
    "Blue Origin": "Organizations and space programs",
    "Roscosmos": "Organizations and space programs",
    "ISRO (Indian Space Research Organization)": "Organizations and space programs",
    "CNSA (China National Space Administration)": "Organizations and space programs",
    "JAXA (Japan Aerospace Exploration Agency)": "Organizations and space programs",
    "Virgin Galactic": "Organizations and space programs",
    
    # Personas y profesiones
    "Astronaut": "People and professions",
    "Cosmonaut": "People and professions",
    "Scientist": "People and professions",
    "Engineer": "People and professions",
    "Pilot": "People and professions",
    "Explorer": "People and professions",
    "Researcher": "People and professions",
    "Visionary": "People and professions",
    
    # Conceptos futuristas y de ciencia ficción
    "Martian colonization": "Futuristic concepts and sci-fi",
    "Extraterrestrial life": "Futuristic concepts and sci-fi",
    "Terraforming": "Futuristic concepts and sci-fi",
    "Hyperspace": "Futuristic concepts and sci-fi",
    "Interstellar travel": "Futuristic concepts and sci-fi",
    "Space tourism": "Futuristic concepts and sci-fi",
    "Space habitats": "Futuristic concepts and sci-fi",
    "Warp drive": "Futuristic concepts and sci-fi",
    "Space elevator": "Futuristic concepts and sci-fi",
    "Wormhole": "Futuristic concepts and sci-fi",  # Añadido nuevo término
    
    # Nuevos términos adicionales
    "Quantum mechanics": "Technology and science",  # Añadido nuevo término
    "Artificial gravity": "Technology and science",  # Añadido nuevo término
}


# Función para clasificar el título
def classify_title(title):
    for keyword, category in classification_dict.items():
        if keyword.lower() in title.lower():  # Compara en minúsculas para no diferenciar por mayúsculas
            return category
    return "Unclassified"  # Si no se encuentra ninguna palabra clave

# Convertimos la función en una UDF (User Defined Function)
classify_udf = udf(classify_title, StringType())

# Inicialización de Spark
sqlContext = SQLContext(sc)

Final_data = cleaned_data.withColumn("classification", classify_udf(col("title")))

# Aplicamos la clasificación
#data_with_TopicClasificacion = data_with_TopicClasificacion.withColumn("classification", classify_udf(col("title")))





In [9]:
# Guardar los resultados de source_analysis en S3
source_analysis_output_path = "s3://spaceflightbuckes2025/Gold /Final_Data/"
Final_data.write.mode("overwrite").parquet(source_analysis_output_path)






In [10]:
# Función para extraer palabras clave
def extract_keywords(text):
    if not text:
        return None
    text = re.sub(r"[^\w\s]", "", text.lower())  # Limpiar texto
    tokens = text.split()  # Tokenización básica
    keywords = [word for word in tokens if word not in stop_words]  # Filtrar stopwords
    return ",".join(keywords)

# UDF para Spark
extract_keywords_udf = udf(extract_keywords, StringType())

# Aplicar UDF para palabras clave
analyzed_data = Final_data.withColumn("keywords", extract_keywords_udf(col("summary")))





In [12]:
# -------------------------------
# Análisis de Tendencias
# -------------------------------

# Conteo de artículos por fuente de noticias
source_analysis = analyzed_data.groupBy("news_site") \
    .agg(count("*").alias("article_count")) \
    .orderBy(desc("article_count"))

# Guardar los resultados de source_analysis en S3
source_analysis_output_path = "s3://spaceflightbuckes2025/Gold /CountArticulesNews/"
source_analysis.write.mode("overwrite").parquet(source_analysis_output_path)

# Tendencias de temas por tiempo
trends_analysis = analyzed_data.groupBy("published_at") \
    .agg(collect_list("keywords").alias("keywords_list"))

# Guardar los resultados de trends_analysis en S3
trends_analysis_output_path = "s3://spaceflightbuckes2025/Gold /trends_analysis/"
trends_analysis.write.mode("overwrite").parquet(trends_analysis_output_path)

# -------------------------------
# Optimizaciones Requeridas
# -------------------------------

# Particionar datos históricos por año y mes
partitioned_data = analyzed_data.withColumn("year", year("published_at")) \
    .withColumn("month", month("published_at"))

# Escribir datos particionados en S3
output_path = "s3://spaceflightbuckes2025/Gold /partitioned_data/"
partitioned_data.write.partitionBy("year", "month").mode("overwrite").parquet(output_path)

# -------------------------------
# Caching de resultados frecuentes
# -------------------------------

# Cache de resultados más consultados (ejemplo: fuentes activas)
source_analysis.cache()


DataFrame[news_site: string, article_count: bigint]


In [13]:
job.commit()

NameError: name 'job' is not defined
