In [13]:
import ollama
import pandas as pd
import json
from tqdm.auto import tqdm

In [14]:
# Query Ollama for the models it reports and print each model's identifier,
# so a valid value for the model name can be picked from the list.
available_models = ollama.list()
installed = available_models['models']
for model in installed:
    print(model.model)


llama3.1:8b-instruct-q5_K_M
llama3.1:8b-instruct-q4_K_M
deepseek-r1:8b
llama3.1:8b


In [15]:
# Model used for every chat request in this notebook.
model_name = "llama3.1:8b-instruct-q4_K_M"

# Smoke test: send a single user message with format='json' and print the raw
# reply text to confirm the model answers with a JSON document.
greeting = {
    'role': 'user',
    'content': 'Hello, respond in json format: {"response": "Hello, world!"}',
}
response = ollama.chat(model=model_name, messages=[greeting], format='json')
print(response['message']['content'])


{"response": "Hello from a language model! How can I assist you today?"}


In [16]:
# Load the combined articles table; later cells read its 'title',
# 'description' and 'keywords' columns.
df = pd.read_csv(filepath_or_buffer='data/news/all_articles_combined.csv')

In [17]:
def _ask_json(prompt):
    """Send one user prompt to ``model_name`` and return the reply as a dict.

    Returns an empty dict when the reply is blank, is not valid JSON, or is
    valid JSON that is not an object, so callers can always use ``.get``.
    Errors raised by ``ollama.chat`` itself are not caught here.
    """
    response = ollama.chat(
        model=model_name,
        messages=[{'role': 'user', 'content': prompt}],
        format='json'
    )
    try:
        parsed = json.loads(response['message']['content'].strip())
    except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
        # Blank/malformed reply, missing keys, or a None content field.
        return {}
    # A bare list/string/number is valid JSON but has no ``.get``.
    return parsed if isinstance(parsed, dict) else {}


def _is_affirmative(value):
    """Interpret the model's ``relevance`` value as a boolean.

    Non-string values keep ordinary truthiness. Strings are treated as false
    when they spell a negative (e.g. ``"false"``), which plain truthiness
    would wrongly accept.
    """
    if isinstance(value, str):
        return value.strip().lower() not in {'', 'false', 'no', 'none', 'null', '0'}
    return bool(value)


def _normalize_location(value):
    """Map the model's ``location`` value to a clean string or ``None``.

    The prompt tells the model to answer ``"None"`` when no state is
    mentioned; that placeholder (and blank/``"null"`` variants) becomes a real
    ``None`` so missing values are represented one way only. Other strings are
    stripped of surrounding whitespace; non-string values pass through as-is.
    """
    if isinstance(value, str):
        cleaned = value.strip()
        return None if cleaned.lower() in {'', 'none', 'null'} else cleaned
    return value


# One entry per row of df, in row order: a state value or None.
locations = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # Skip missing fields so the prompt never contains the literal text "nan".
    text = ' '.join(
        str(row[col])
        for col in ('title', 'description', 'keywords')
        if pd.notna(row[col])
    )

    # Step 1: cheap yes/no filter so only relevant rows get the second request.
    relevance_data = _ask_json(
        f'Is this text about crime in a US location? Respond in JSON format: {{"relevance": true}} or {{"relevance": false}}\n\nText: {text}'
    )

    if _is_affirmative(relevance_data.get('relevance')):
        # Step 2: ask for the state only when the text was judged relevant.
        location_data = _ask_json(
            f'Identify the exact US **STATE** mentioned in this text. If no state is mentioned, respond with "None", if the region is subnational, respond with the two digit state code. Respond in JSON format: {{"location": "two digit state code"}}\n\nText: {text}'
        )
        locations.append(_normalize_location(location_data.get('location')))
    else:
        locations.append(None)

df['location'] = locations


100%|██████████| 13875/13875 [1:05:53<00:00,  3.51it/s]


In [18]:
# NOTE(review): this whole cell is disabled (commented out) and does not run.
# If re-enabled, standardize_state would map each value of df['location'] to
# the part after the '-' in a pycountry US subdivision code, matching either
# that code (case-insensitive) or the subdivision name (case-insensitive), and
# return None for missing or unmatched values. It requires the pycountry
# package.

# import pycountry

# def standardize_state(location):
#     if location is None or (isinstance(location, float) and pd.isna(location)):
#         return None
    
#     location_str = str(location).strip()
    
#     # Try to find US state by name or code
#     for subdivision in pycountry.subdivisions.get(country_code='US'):
#         if (location_str.upper() == subdivision.code.split('-')[1] or 
#             location_str.lower() == subdivision.name.lower()):
#             return subdivision.code.split('-')[1]
    
#     return None

# df['location'] = df['location'].apply(standardize_state)


In [19]:
# Display the annotated 'location' column for a quick visual check.
df.loc[:, 'location']

0          TN
1          NY
2        None
3          MN
4          IL
         ... 
13870      NC
13871      NY
13872    None
13873      ID
13874    None
Name: location, Length: 13875, dtype: object

In [20]:
# Persist the table, including the new 'location' column, without the index.
df.to_csv(
    path_or_buf='data/news/all_articles_combined_machine_annotation.csv',
    index=False,
)
