In [13]:
# Import des modules
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import col


In [17]:
from pyspark.sql import SparkSession

# Step 1: Initialize SparkSession
spark = SparkSession.builder \
    .appName("JSON File Reading") \
    .getOrCreate()

try:
    # Step 2: Read JSON file into DataFrame
    df = spark.read.json("C:\\Users\\hp\\id2\\bg\\fpo_data.json")

    # Step 3: Show DataFrame schema and sample data
    df.printSchema()
    df.show(5, truncate=False)  # Show first 5 rows without truncation

    # Reaching this point means the read, schema print and preview all succeeded
    print("JSON file successfully read into DataFrame!")

except Exception as e:
    # Step 4: Report the failure in a readable form...
    print("An error occurred while reading the JSON file:")
    print(str(e))
    # ...then re-raise. The following cells use `df`; swallowing the error here
    # would leave `df` undefined (or bound to a stale value from an earlier run)
    # and make them fail later with a far less helpful message.
    raise
 


root
 |-- Abstract: string (nullable = true)
 |-- Assignee: string (nullable = true)
 |-- Document Number: string (nullable = true)
 |-- Document Type: string (nullable = true)
 |-- Filing Date: string (nullable = true)
 |-- Inventor Name: string (nullable = true)
 |-- Publication Date: string (nullable = true)
 |-- Title: string (nullable = true)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [18]:
# Keep only the Title column for the title-level analysis below.
titles = df.select("Title")
# Preview a small sample instead of printing thousands of rows into the notebook.
titles.show(20)

+----------------------------------------+
|                                   Title|
+----------------------------------------+
|                    Luftfahrzeugantri...|
|                    BESTIMMUNG VON KR...|
|                    BESTIMMUNG VON KR...|
|                      LEISTUNGSPARAMETER|
|                    BESTIMMUNG VON KR...|
|                    BESTIMMUNG VON KR...|
|                    BESTIMMUNG VON KR...|
|                             ABGASGEHALT|
|                    ANTRIEBSSYSTEM FÜ...|
|                      KRAFTSTOFFMERKMALE|
|                    VERFAHREN UND SYS...|
|                    FLUGZEUGNOTSTROMA...|
|                    Emergency power u...|
|                    Emergency power u...|
|                    AVIATION FUEL COM...|
|                    METHOD FOR PRODUC...|
|                    BIORENEWABLE KERO...|
|                    METHOD TO PRODUCE...|
|                    PROCESS FOR INTEG...|
|                           AVIATION FUEL|
|          

In [19]:
# Number of rows in `titles`; no de-duplication is applied, so repeated titles
# are counted once per row.
total_titles = titles.count()
print("Total number of titles:", total_titles)


Total number of titles: 3500


In [20]:
# Occurrences of each distinct Title string, most frequent first.
# Grouping is on the exact string, so differently-cased variants of the same
# title end up in separate groups.
popular_titles = (
    titles
    .groupBy("Title")
    .count()
    .sort(col("count").desc())
)
popular_titles.show()


+--------------------+-----+
|               Title|count|
+--------------------+-----+
|  PROCESSING BIOMASS|   25|
|  Processing biomass|   19|
|PROCESSING BIOMAS...|   16|
|Processing biomas...|   14|
|Environment-frien...|   10|
|Fire suppression ...|    9|
|Turbine section o...|    9|
|Additives for min...|    8|
|ELECTRICAL POWER ...|    8|
|INTEGRATED POWER ...|    8|
|HIGH SPEED MULTI-...|    7|
|Systems and metho...|    7|
|ROTARY ENERGY CON...|    7|
|GASIFICATION PROCESS|    7|
|FIRE SUPPRESSION ...|    7|
|ASSOCIATIVE POLYM...|    7|
|AERO COMPRESSION ...|    6|
|        COMPOSITIONS|    6|
|MEASURING WEIGHT ...|    6|
|BIORENEWABLE KERO...|    6|
+--------------------+-----+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import avg, length

# Aggregate the mean of length(Title) over all rows; the aggregate yields a
# single row, from which the aliased value is pulled back to the driver.
avg_row = titles.agg(avg(length("Title")).alias("avg_title_length")).first()
avg_title_length = avg_row["avg_title_length"]
print("Average title length:", avg_title_length)


Average title length: 59.68076593312375


In [22]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import Row



In [3]:
import pandas as pd

# NOTE: this rebinds `df`. The Spark cell earlier in the notebook assigned a
# Spark DataFrame to `df`; from here on `df` is a pandas DataFrame.
df = pd.read_json("C:\\Users\\hp\\id2\\bg\\fpo_data.json")

# Take an explicit copy of the two text columns so that adding or modifying
# columns on `titles_abstracts_df` later does not raise SettingWithCopyWarning
# and can never write through to `df`.
titles_abstracts_df = df[["Title", "Abstract"]].copy()
titles_abstracts_df.head()


Unnamed: 0,Title,Abstract
0,Luftfahrzeugantriebssystem,Ein Antriebssystem (4) für ein Luftfahrzeug (1...
1,BESTIMMUNG VON KRAFTSTOFFMERKMALEN,Die vorliegende Anmeldung offenbart ein Verfah...
2,BESTIMMUNG VON KRAFTSTOFFMERKMALEN,Die vorliegende Anmeldung offenbart ein Verfah...
3,LEISTUNGSPARAMETER,Die vorliegende Anmeldung offenbart ein Verfah...
4,BESTIMMUNG VON KRAFTSTOFFMERKMALEN,Ein Verfahren (1040) zur Bestimmung eines oder...


In [4]:
# (rows, columns) of the Title/Abstract frame.
titles_abstracts_df.shape

(3500, 2)