<a href="https://colab.research.google.com/github/yasaswiyash18/BIG_DATA_ANALYSIS/blob/main/CodTech_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import urllib.request

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc, when

In [None]:
# Build the notebook-wide Spark session (getOrCreate hands back an existing
# session if one is already active) with the driver memory set to "4g".
spark = (
    SparkSession.builder
    .appName("E-commerce Analysis")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
# Download and prepare dataset
def get_ecommerce_data(
    url="https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv",
    local_path="telco_data.csv",
    force_download=False,
):
    """Fetch the customer CSV and return it as a Spark DataFrame.

    Note: despite the function name, the default URL points at a Telco
    customer-churn dataset, not e-commerce data. The name is kept so that
    existing callers continue to work.

    Args:
        url: Location of the CSV file to download.
        local_path: Where the downloaded CSV is stored on disk.
        force_download: When True, download again even if ``local_path``
            already exists. By default an existing local copy is reused so
            that re-running the notebook does not hit the network again.

    Returns:
        The Spark DataFrame produced by ``spark.createDataFrame`` from the
        pandas frame read out of ``local_path``. Uses the module-level
        ``spark`` session.
    """
    if force_download or not os.path.exists(local_path):
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer never leaves a truncated file at local_path
        # that a later run would mistake for a complete cached copy.
        partial_path = f"{local_path}.part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, local_path)

    # Read with pandas first (for easier handling), then hand over to Spark.
    pdf = pd.read_csv(local_path)
    return spark.createDataFrame(pdf)

In [None]:
def _summarize_by(df, group_col):
    """Return customer counts and mean MonthlyCharges per value of ``group_col``."""
    return df.groupBy(group_col).agg(
        count("*").alias("customer_count"),
        avg("MonthlyCharges").alias("avg_monthly_charges"),
    )


def analyze_customer_data(df, service_columns=None, low_threshold=35, high_threshold=70):
    """Run the churn, service, contract and charges analyses on ``df``.

    Side effects: prints a banner and, for every service column, prints and
    ``show()``s the value distribution ordered by descending count.

    Args:
        df: Spark DataFrame with at least the columns ``MonthlyCharges``,
            ``Churn``, ``Contract`` and every name in ``service_columns``.
        service_columns: Columns whose value distributions are displayed.
            Defaults to PhoneService, InternetService, OnlineSecurity and
            OnlineBackup.
        low_threshold: MonthlyCharges strictly below this value are "Low".
        high_threshold: MonthlyCharges at or above this value are "High";
            values in ``[low_threshold, high_threshold)`` are "Medium".

    Returns:
        A tuple ``(churn_analysis, contract_analysis, charges_distribution)``
        of Spark DataFrames. The first two hold ``customer_count`` and
        ``avg_monthly_charges`` per group; the third holds the row count per
        ``charges_category``.

    Raises:
        ValueError: If ``low_threshold`` is greater than ``high_threshold``.
    """
    # Validate before touching the data so a bad call fails fast and cheaply.
    if low_threshold > high_threshold:
        raise ValueError(
            f"low_threshold ({low_threshold}) must not exceed "
            f"high_threshold ({high_threshold})"
        )
    if service_columns is None:
        service_columns = ["PhoneService", "InternetService", "OnlineSecurity", "OnlineBackup"]

    print("=== Telco Customer Analysis ===")

    # Clean monthly charges column
    df = df.withColumn("MonthlyCharges", col("MonthlyCharges").cast("double"))

    # 1. Customer churn analysis
    churn_analysis = _summarize_by(df, "Churn")

    # 2. Service adoption analysis
    for service in service_columns:
        print(f"\n{service} Distribution:")
        df.groupBy(service).count().orderBy(desc("count")).show()

    # 3. Contract type analysis
    contract_analysis = _summarize_by(df, "Contract")

    # 4. Monthly charges analysis
    # There is deliberately no otherwise(): a row whose MonthlyCharges is null
    # matches none of the branches and therefore gets a null charges_category
    # instead of being mislabelled.
    monthly = col("MonthlyCharges")
    df = df.withColumn(
        "charges_category",
        when(monthly < low_threshold, "Low")
        .when((monthly >= low_threshold) & (monthly < high_threshold), "Medium")
        .when(monthly >= high_threshold, "High"),
    )

    charges_distribution = df.groupBy("charges_category").count()

    return churn_analysis, contract_analysis, charges_distribution

In [None]:
def main():
    """Download the data, run the analyses, and display the three result tables.

    The module-level ``spark`` session is stopped when this function exits,
    whether it finishes normally or raises. The session has to be created
    again before ``main()`` can be run a second time.
    """
    try:
        # Get data
        print("Downloading and preparing data...")
        df = get_ecommerce_data()

        # Perform analysis
        print("\nPerforming analysis...")
        churn_stats, contract_stats, charges_dist = analyze_customer_data(df)

        # Display results
        print("\nChurn Analysis:")
        churn_stats.show()

        print("\nContract Analysis:")
        contract_stats.show()

        print("\nCharges Distribution:")
        charges_dist.show()
    finally:
        # Stop Spark session even if a step above failed, so the session is
        # not left running after an error.
        spark.stop()

# Entry point: run the full pipeline when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

Downloading and preparing data...

Performing analysis...
=== Telco Customer Analysis ===

PhoneService Distribution:
+------------+-----+
|PhoneService|count|
+------------+-----+
|         Yes| 6361|
|          No|  682|
+------------+-----+


InternetService Distribution:
+---------------+-----+
|InternetService|count|
+---------------+-----+
|    Fiber optic| 3096|
|            DSL| 2421|
|             No| 1526|
+---------------+-----+


OnlineSecurity Distribution:
+-------------------+-----+
|     OnlineSecurity|count|
+-------------------+-----+
|                 No| 3498|
|                Yes| 2019|
|No internet service| 1526|
+-------------------+-----+


OnlineBackup Distribution:
+-------------------+-----+
|       OnlineBackup|count|
+-------------------+-----+
|                 No| 3088|
|                Yes| 2429|
|No internet service| 1526|
+-------------------+-----+


Churn Analysis:
+-----+--------------+-------------------+
|Churn|customer_count|avg_monthly_charges|
