Network_Visualization_SriVamsi_Kota

### Install Python Libraries

In [1]:
#! pip install langchain
#! pip install langchain-core
#! pip install langchain-community
#! pip install google-generativeai
#! pip install gephistreamer


Collecting langchain
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloading langsmith-0.3.28-py3-none-any.whl.metadata (15 kB)
Collecting pydantic<3.0.0,>=2.7.4 (from langchain)
  Downloading pydantic-2.11.3-py3-none-any.whl.metadata (65 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading sqlalchemy-2.0.40-cp313-cp313-win_amd64.whl.metadata (9.9 kB)
Collecting tenacity!=8.4.0,<10.0.0,>=8.1.0 (from langchain-core<1.0.0,>=0.3.51->langchain)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<1.0.0,>=0.3.51->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.wh

### Create a list of web URLs for data extraction

In [1]:
# Wikipedia biography pages of prominent tech leaders. Each page is
# downloaded and processed for relationships later in the notebook.
url_list = [
    "https://en.wikipedia.org/wiki/Elon_Musk",
    "https://en.wikipedia.org/wiki/Mark_Zuckerberg",
    "https://en.wikipedia.org/wiki/Bill_Gates",
    "https://en.wikipedia.org/wiki/Jeff_Bezos",
    "https://en.wikipedia.org/wiki/Steve_Jobs",
    "https://en.wikipedia.org/wiki/Sam_Altman",
    "https://en.wikipedia.org/wiki/Larry_Ellison",
    "https://en.wikipedia.org/wiki/Larry_Page",
    "https://en.wikipedia.org/wiki/Sundar_Pichai",
    "https://en.wikipedia.org/wiki/Satya_Nadella",
]


### Define function to clean data

In [2]:
# define a function to clean the extracted web URL data
import re #for regular expression 
 
def clean_text(text):
    """Reduce scraped page text to space-separated ASCII letters and digits.

    Steps, in order:
      1. HTML tags are replaced by a space so that text on either side of a
         tag is not glued together.
      2. http/https URLs are removed.
      3. Every character that is not an ASCII letter, a digit or whitespace
         is dropped (this includes punctuation and accented letters).
      4. All runs of whitespace (spaces, newlines, tabs, non-breaking
         spaces) are collapsed to a single space and the ends are trimmed.

    Whitespace is deliberately kept until the final step: deleting newlines
    together with the punctuation would fuse the last word of one line with
    the first word of the next.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives the cleaning.
    """
    # Replace tags with a space rather than nothing, e.g. "</p><p>".
    text = re.sub(r'<[^>]*?>', ' ', text)
    # Remove URLs
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    # Remove special characters but keep every kind of whitespace for now,
    # so that line breaks still separate words.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse whitespace runs to one space and trim both ends in one pass.
    return ' '.join(text.split())


### Use the LangChain framework to extract data

In [3]:
# extract the data from the URLs
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import WebBaseLoader
 
def extract_data_from_URL(url, chunk_size=3000, chunk_overlap=100):
    """Download one web page, clean its text and split it into chunks.

    The page is fetched with ``WebBaseLoader``, its ``page_content`` is
    passed through ``clean_text`` and the result is split with
    ``RecursiveCharacterTextSplitter``. The number of chunks produced is
    printed as a simple progress indicator.

    Args:
        url: Address of the page to load.
        chunk_size: Value passed to the splitter as ``chunk_size``.
            Defaults to 3000, the value this notebook has always used.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 100.

    Returns:
        A list of ``Document`` objects, one per chunk.
    """
    loader = WebBaseLoader([url])
    # A single URL is requested, so the last loaded document is the page.
    page_text = loader.load()[-1].page_content
    cleaned_text = clean_text(page_text)
    documents = [Document(page_content=cleaned_text)]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    smaller_doc = splitter.split_documents(documents)
    print(len(smaller_doc))
    return smaller_doc


USER_AGENT environment variable not set, consider setting it to identify your requests.


### Use Gemini API and generate sample response

In [4]:
import google.generativeai as genai 
import os 
 
# System instruction used for the connection test below.
system_prompt='''
Answer the question in JSON format and nothing else,Do not use code block formatting.
'''
# Test the connection to Gemini.
# SECURITY: the API key is read from the GOOGLE_API_KEY environment variable
# instead of being embedded in the notebook. A key that has ever been
# committed or shared inside a notebook must be treated as leaked and revoked.
# A missing variable raises KeyError here, before any request is sent.
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])
client = genai.GenerativeModel(model_name="gemini-1.5-flash", system_instruction=system_prompt)
response = client.generate_content("Write a story about how texas can become a tech hub in the future.")
print(response.text)


{"title": "Texas Tech Surge: A 2040 Vision", "story": "By 2040, the Texas tech scene had exploded, defying all expectations.  It wasn't a single event, but a confluence of factors.  First, the state aggressively invested in STEM education, creating a pipeline of highly skilled graduates from universities across the state, not just Austin.  Smaller cities like San Antonio, El Paso, and even Lubbock saw flourishing tech incubators, supported by state grants and a renewed focus on vocational training.  Secondly, the regulatory environment shifted.  Texas streamlined its permitting processes, making it easier for startups to establish themselves and scale rapidly.  Red tape was replaced with red carpet welcomes.  Thirdly, a concerted effort was made to diversify the tech workforce.  Initiatives focused on attracting and retaining talent from diverse backgrounds, creating a vibrant and inclusive ecosystem.  This included generous scholarship programs targeted at underrepresented minorities 

# Create system prompt for the LLM to understand context of the task

In [5]:
# System prompt for the relationship-extraction step.
# It is passed as `system_instruction` to each Gemini model in the extraction
# loop, and the responses are later parsed with `json.loads`. The prompt
# therefore asks for a bare JSON array of objects with the keys
# "node_1", "node_2" and "edge", without code fences or surrounding text.
 
system=""" You are a network graph maker tasked with analyzing the relationships involving top leaders in the world. Your job is to process the provided context chunk 
and extract an ontology of terms that represent key entrepreneurs, their associated entities, and all kinds of relationships present in the context.
 
**Guidelines for Extraction:**
 
1. **Identify Key Entrepreneurs and Related Terms**:
   - Extract key entrepreneurs and related concepts such as:
     - Companies, organizations, or industries they are associated with.
     - Collaborators, partners, rivals, or competitors.
     - Key innovations, achievements, or milestones.
     - Locations, events, or time periods relevant to their actions.
 
2. **Identify Relationships**:
   - Extract all types of relationships between entrepreneurs and other entities (or between entities themselves).
   - Relationships can include:
     - Professional roles or associations.
     - Business partnerships, collaborations, or rivalries.
     - Innovations or contributions to industries.
     - Personal connections or influences.
     - Historical events or shared milestones.
 
3. **Define Relationships**:
   - Clearly specify the nature of each relationship in simple and concise terms.
   - Relationships should convey meaningful connections relevant to the context.
 
**Response Format**:
- Provide your output **strictly as a list of JSON objects**. No additional text, descriptions,tags or comments are allowed.
- Each object should include the following fields:
  - `"node_1"`: The first entity in the relationship (can be a person, organization, or concept).
  - `"node_2"`: The second entity in the relationship.
  - `"edge"`: A concise sentence describing the relationship between `node_1` and `node_2`.
 
**Example Output**:
[
   {
       "node_1": "Elon Musk",
       "node_2": "SpaceX",
       "edge": "Elon Musk founded SpaceX to revolutionize space exploration."
   },
   {
       "node_1": "Steve Jobs",
       "node_2": "Apple Inc.",
       "edge": "Steve Jobs co-founded Apple Inc., a leading tech company."
   },
   {
       "node_1": "Mark Zuckerberg",
       "node_2": "Sheryl Sandberg",
       "edge": "Sheryl Sandberg worked closely with Mark Zuckerberg as COO of Facebook."
   },
   {
       "node_1": "Jeff Bezos",
       "node_2": "Blue Origin",
       "edge": "Jeff Bezos founded Blue Origin to focus on space exploration."
   }
]
 
**Important Note**:
- Always respond exclusively in JSON format. Any deviation from the JSON structure or inclusion of additional text will not be accepted.
- Do not use code block formatting like ` ``` `.
- Output must be a valid JSON array of objects without any surrounding text.
 
Please provide the context containing information about entrepreneurs and their relationships for analysis.

"""


# Use LLMs to extract data

In [6]:
from datetime import datetime
from itertools import cycle
import time

# Tunables for the extraction run.
MAX_CHUNKS_PER_URL = 30            # only the first N chunks of each page are sent
MAX_MODEL_SWEEPS = 3               # full passes over `models` before giving up
RATE_LIMIT_BACKOFF_SECONDS = 30    # base wait after every model refused once
RATE_LIMIT_MARKERS = ('quota', 'exceeded', 'limit')

results = []
models = [
    'gemini-1.5-pro',
    'gemini-1.5-flash',
    'gemini-1.5-flash-8b',
]
model_cycle = cycle(models)
model_name = next(model_cycle)
start_time = datetime.now()
# Set once every model has been rate-limited MAX_MODEL_SWEEPS times in a row
# for the same chunk. The run then stops and keeps what was collected,
# instead of cycling through the models forever.
quota_exhausted = False

for url in url_list:
    if quota_exhausted:
        break
    smaller_doc = extract_data_from_URL(url)

    for doc in smaller_doc[:MAX_CHUNKS_PER_URL]:
        rate_limit_hits = 0  # consecutive rate-limit errors for this chunk
        while True:
            try:
                client = genai.GenerativeModel(
                    model_name=model_name,
                    system_instruction=system,
                    generation_config={"response_mime_type": 'application/json'}
                )
                chat_completion = client.generate_content(doc.page_content)
                results.append(chat_completion.candidates[0].content.parts[0].text)
                break  # Success! Break out of the retry loop.
            except Exception as e:
                # str(e) is always a string, whereas e.args may be empty or
                # hold a non-string, which would make the handler itself fail.
                error_text = str(e).lower()
                print(e.args)

                if not any(marker in error_text for marker in RATE_LIMIT_MARKERS):
                    print(f"Unhandled error: {e}")
                    break  # Stop retrying for non-rate-limit errors

                print(f'Rate limit hit for model: {model_name}')
                model_name = next(model_cycle)
                print(f'Switching to next model: {model_name}')

                rate_limit_hits += 1
                if rate_limit_hits % len(models):
                    continue  # some model has not been tried in this sweep yet

                # Every model has now refused this chunk once more.
                sweeps_done = rate_limit_hits // len(models)
                if sweeps_done >= MAX_MODEL_SWEEPS:
                    print('All models are rate limited; stopping extraction early.')
                    quota_exhausted = True
                    break
                wait_seconds = RATE_LIMIT_BACKOFF_SECONDS * sweeps_done
                print(f'All models rate limited; waiting {wait_seconds}s before retrying')
                time.sleep(wait_seconds)

        if quota_exhausted:
            break

end_time = datetime.now()
print(f'extracted information in {end_time - start_time}')
print(f'total results: {len(results)}')

56
('You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.',)
Rate limit hit for model: gemini-1.5-pro
Switching to next model: gemini-1.5-flash
32
('You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.',)
Rate limit hit for model: gemini-1.5-flash
Switching to next model: gemini-1.5-flash-8b
('You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.',)
Rate limit hit for model: gemini-1.5-flash-8b
Switching to next model: gemini-1.5-pro
('You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.',)
Rate limit hit for model:

# Save the results to a JSON file

In [7]:
import json

# Parse each raw LLM response and merge all relationship records into one
# list. A response is expected to be a JSON array of objects. A single JSON
# object is accepted as a one-element array, because `list.extend` on a dict
# would otherwise add only its keys as strings. Anything else is reported
# and skipped.
combined_nodes_and_edges=[]
for res in results:
    try:
        parsed = json.loads(res) #convert the string result from LLM to JSON
    except (json.JSONDecodeError, TypeError) as e:
        print('buggy JSON object', e)
        continue

    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        print('buggy JSON object', f'expected a list of objects, got {type(parsed).__name__}')
        continue

    records = [item for item in parsed if isinstance(item, dict)]
    if len(records) != len(parsed):
        print('buggy JSON object', f'dropped {len(parsed) - len(records)} non-object entries')
    combined_nodes_and_edges.extend(records)

# json.dump escapes non-ASCII characters by default; the explicit encoding
# only makes the file format independent of the platform default.
with open('Nodes_and_edges.json','w', encoding='utf-8') as file:
    json.dump(combined_nodes_and_edges,file,indent=1)


buggy JSON object Extra data: line 1 column 142 (char 141)


# Send JSON data to Gephi

In [8]:
from gephistreamer import graph
from gephistreamer import streamer
# Connect to the Gephi server on localhost:8080, targeting "workspace1",
# and wrap the connection in a Streamer used to push nodes and edges.
stream = streamer.Streamer(
    streamer.GephiWS(
        hostname="localhost",
        port=8080,
        workspace="workspace1",
    )
)

In [9]:
# Read the saved relationship records back from disk. `results` now holds
# the parsed records rather than the raw LLM response strings.
with open('Nodes_and_edges.json','r') as file:
    results = json.loads(file.read())


In [10]:
# Send every relationship record to Gephi as two nodes joined by one edge.
# A record is validated before anything is streamed: if the nodes were added
# first and the 'edge' lookup failed afterwards, Gephi would be left with
# orphan nodes from a record that is otherwise rejected.
required_keys = ('node_1', 'node_2', 'edge')
for res in results:
    if not isinstance(res, dict):
        print('buggy JSON object', 'not a JSON object', res)
        continue
    missing_keys = [key for key in required_keys if key not in res]
    if missing_keys:
        print('buggy JSON object', f'missing {missing_keys}', res)
        continue

    try:
        node_a = graph.Node(res['node_1'],custom_property=1)
        node_b = graph.Node(res['node_2'],custom_property=2)
        edge_ab = graph.Edge(node_a,node_b,custom_property=res['edge'])
        stream.add_node(node_a,node_b)
        stream.add_edge(edge_ab)

    except Exception as e:
        # Best effort: report the failing record and carry on with the rest.
        print('buggy JSON object', e,res)


buggy JSON object 'edge' {'node_1': 'Elon Musk', 'node_2': 'rebranded Twitter to X in 2023'}
buggy JSON object 'node_2' {'node_1': 'Larry Ellison', 'node': 'Mark Hurd', 'edge': 'Larry Ellison appointed Mark Hurd as CEO of Oracle'}
