#ALLMS:

In [2]:
# Install third-party packages into the notebook environment via the shell.
# NOTE(review): versions are not pinned, so a re-run may resolve different releases.
!pip install deep-translator allms pymongo dnspython langdetect openai anthropic


Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting allms
  Downloading allms-1.0.12-py3-none-any.whl.metadata (10 kB)
Collecting pymongo
  Downloading pymongo-4.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting openai
  Downloading openai-1.76.2-py3-none-any.whl.metadata (25 kB)
Collecting anthropic
  Downloading anthropic-0.50.0-py3-none-any.whl.metadata (25 kB)
Collecting google-cloud-aiplatform==1.85.0 (from allms)
  Downloading google_clou

In [9]:
!pip install --upgrade allms  # Ensure the latest version is installed



In [4]:
# Install additional third-party packages via the shell.
# NOTE(review): versions are not pinned, so a re-run may resolve different releases.
!pip install fasttext streamlit praw requests


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━

In [5]:
import json

def store_to_db(data):
    """Append one record to ``data.json`` as a single line of JSON (JSON Lines).

    Args:
        data: Any object accepted by ``json.dumps``.

    Raises:
        TypeError: If ``data`` is not JSON-serializable (raised by ``json.dumps``).
        OSError: If ``data.json`` cannot be opened for appending.

    NOTE(review): a later cell defines another ``store_to_db`` that writes to
    MongoDB; when the notebook is run top to bottom that definition replaces
    this one.
    """
    # Serialize first so an unserializable record never touches the file.
    line = json.dumps(data)
    with open("data.json", "a", encoding="utf-8") as f:
        # Terminate with a real newline; the previous "\\n" literal wrote the
        # two characters backslash + n, gluing every record onto one line.
        f.write(line + "\n")


In [20]:
import asyncio
import re
import time
from datetime import datetime
from langdetect import detect
from deep_translator import GoogleTranslator
import allms
 # Import Allms directly from allms
from pymongo import MongoClient
import requests

In [38]:
#  Step 1: Multilingual keyword mapping
base_terms = ["Bitcoin", "crypto", "$BTC"]
languages = ["en", "es", "zh", "hi", "ar", "fr", "de", "ja", "ru", "ko", "pt", "tr"]

# The notebook uses short language codes as keys, but deep-translator's Google
# backend has no bare "zh" code (it expects "zh-CN" / "zh-TW"). Map only the
# codes that differ; everything else is passed through unchanged.
_TRANSLATOR_TARGET_CODES = {"zh": "zh-CN"}


def get_translated_keywords():
    """Build a mapping of language code -> search terms translated from English.

    Returns:
        dict[str, list[str]]: One entry per code in ``languages`` (keys are the
        notebook's own codes, e.g. ``"zh"``), each holding the terms of
        ``base_terms`` in the same order.

    Notes:
        * English terms are returned as-is; no translation request is made.
        * One translator object is created per language and reused for every term.
        * If the translator returns an empty result for a term, the original
          English term is kept so the keyword list never contains ``None``.
        * Errors raised by the translator (network failures, unsupported
          languages) propagate to the caller.
    """
    keyword_map = {}
    for lang in languages:
        if lang == "en":
            # Source language == target language: nothing to translate.
            keyword_map[lang] = list(base_terms)
            continue

        translator = GoogleTranslator(
            source="en",
            target=_TRANSLATOR_TARGET_CODES.get(lang, lang),
        )
        keyword_map[lang] = [translator.translate(term) or term for term in base_terms]
    return keyword_map


In [31]:
#  Step 2: Ingestion from the Twitter recent-search API
async def fetch_twitter_data(keyword_map, queue):
    """Query the Twitter recent-search endpoint for every term and enqueue the hits.

    Args:
        keyword_map: Mapping of language code -> iterable of search terms.
        queue: Object with an awaitable ``put`` (e.g. ``asyncio.Queue``) that
            receives one dict per tweet.

    Each enqueued dict has the keys ``source`` (always ``"twitter"``),
    ``language_query`` (the language code the term was listed under), ``text``,
    ``timestamp`` (UTC time of ingestion, not of the tweet itself) and ``term``.

    Responses with a status code other than 200 are skipped. Exceptions raised
    by ``requests`` (including timeouts) propagate to the caller.

    NOTE(review): ``TWITTER_BEARER_TOKEN`` is not defined in the cells shown
    here; it must exist as a global before this coroutine runs.
    """
    headers = {"Authorization": f"Bearer {TWITTER_BEARER_TOKEN}"}
    for lang, terms in keyword_map.items():
        for term in terms:
            # requests is blocking: run it in a worker thread so the event loop
            # (and the consumer side of the queue) keeps running meanwhile.
            # The timeout prevents one stalled connection from hanging ingestion.
            response = await asyncio.to_thread(
                requests.get,
                "https://api.twitter.com/2/tweets/search/recent",
                params={"query": term, "max_results": 10},
                headers=headers,
                timeout=10,
            )
            if response.status_code != 200:
                continue

            # "data" is looked up with a default, so a missing key yields no tweets.
            for tweet in response.json().get("data", []):
                await queue.put({
                    "source": "twitter",
                    "language_query": lang,
                    "text": tweet["text"],
                    "timestamp": datetime.utcnow().isoformat(),
                    "term": term
                })


In [32]:

#  Step 3: Language detection, translation, and preprocessing
def preprocess_message(msg):
    """Detect the message language, translate it to English and strip noise.

    Args:
        msg: Dict with at least a ``"text"`` key; ``"language_query"`` is used
            as the fallback language when detection fails.

    Returns:
        dict: A new dict containing every key of ``msg`` plus
        ``"lang_detected"`` (the detected or fallback code, ``"unknown"`` if
        neither is available) and ``"text_translated"`` (English text with
        URLs removed and all characters other than word characters,
        whitespace, ``$``, ``#`` and ``@`` removed).

    Raises:
        KeyError: If ``msg`` has no ``"text"`` key.
        Exception: Whatever the translator raises (network errors, unsupported
            language codes) propagates to the caller.
    """
    text = msg["text"]

    try:
        detected_lang = detect(text)
    except Exception:
        # Detection failed for this text; fall back to the language the query
        # was issued in. A bare ``except:`` would also trap KeyboardInterrupt
        # and SystemExit.
        detected_lang = msg.get("language_query", "unknown")

    if detected_lang == "en":
        translated = text
    else:
        # Detector / notebook codes that the translator spells differently.
        # "unknown" is not a language code at all, so let the translator
        # auto-detect the source in that case.
        source_overrides = {
            "unknown": "auto",
            "zh": "zh-CN",
            "zh-cn": "zh-CN",
            "zh-tw": "zh-TW",
        }
        source = source_overrides.get(detected_lang, detected_lang)
        # Keep the original text if the translator yields nothing, so the
        # regex below always receives a string.
        translated = GoogleTranslator(source=source, target="en").translate(text) or text

    clean_text = re.sub(r"http\S+|[^\w\s$#@]", "", translated)
    return {
        **msg,
        "lang_detected": detected_lang,
        "text_translated": clean_text
    }


In [33]:
#  Step 4: Bot/spam detection (basic keywords)
def is_bot(tweet_text):
    """Return True when the text contains any known spam phrase.

    The check is a case-insensitive substring match against a fixed list of
    phrases ("giveaway", "free btc", "airdrop").
    """
    lowered = tweet_text.lower()
    for phrase in ("giveaway", "free btc", "airdrop"):
        if phrase in lowered:
            return True
    return False

In [34]:
#  Step 5: Geo-location estimation from language
def detect_geo_from_language(lang):
    """Estimate a coarse region label from a language code.

    Args:
        lang: A language code such as ``"ja"`` or a region-qualified code such
            as ``"zh-cn"``. Only the primary subtag (the part before the first
            ``-`` or ``_``) is used, compared case-insensitively.

    Returns:
        str: The mapped region label, or ``"Unknown"`` when the language is not
        in the table or ``lang`` is not a string.
    """
    language_country_map = {
        "ja": "Japan",
        "ru": "Russia",
        "zh": "China",
        "es": "Spain/Mexico",
        "hi": "India",
        "ar": "Middle East",
    }
    if not isinstance(lang, str):
        return "Unknown"

    # Language detectors may emit region-qualified codes (e.g. "zh-cn"), which
    # would never match the two-letter keys above without normalization.
    primary = lang.strip().lower().replace("_", "-").split("-", 1)[0]
    return language_country_map.get(primary, "Unknown")

In [36]:
# Step 6: Sentiment analysis using allms
def analyze_sentiment(data):
    """Attach a sentiment label to a preprocessed message.

    Reads ``data["text_translated"]``, passes it to ``classify_text`` with
    ``task="sentiment"`` and a fixed list of model identifiers, and stores the
    ``"output"`` field of the result under ``data["sentiment"]``.

    The input dict is modified in place and the same object is returned.

    NOTE(review): ``classify_text`` is neither defined nor imported in the
    cells shown here (only ``import allms`` is) — confirm where it comes from.
    """
    text = data["text_translated"]
    model_ids = ["openai:gpt-4", "anthropic:claude-3", "hf:mistral"]
    verdict = classify_text(text, task="sentiment", models=model_ids)
    data["sentiment"] = verdict["output"]
    return data


In [None]:
# Step 7: Topic extraction placeholder
def extract_topics(text):
    """Return a fixed placeholder topic list; ``text`` is currently ignored.

    A new list is built on every call, so callers may mutate the result.
    """
    # Replace with BERTopic later
    placeholder_topics = ("price", "regulation")
    return list(placeholder_topics)

In [37]:
# Step 8: Store data in MongoDB
# Module-level MongoDB handles: a client for the local server, the
# "bitcoin_data" database and its "ingested" collection.
client = MongoClient("mongodb://localhost:27017/")
db = client["bitcoin_data"]
collection = db["ingested"]


def store_to_db(data):
    """Insert one document into the ``bitcoin_data.ingested`` collection.

    Args:
        data: The document (dict) to insert.

    NOTE(review): this reuses the name of the earlier ``store_to_db`` that
    appends to ``data.json``; whichever cell ran last is the one in effect.
    """
    collection.insert_one(data)

In [None]:
#  Step 9: Async pipeline manager
async def pipeline():
    """Run ingestion and processing end to end, then return.

    Flow: translate the keywords, start the Twitter producer as a background
    task, and consume its queue. Each message is preprocessed; messages not
    flagged by ``is_bot`` are tagged with a geo estimate, sentiment and
    topics, then stored via ``store_to_db``. The loop pauses 0.5 s between
    messages.

    Returns:
        None, once the producer has finished and every queued message has
        been handled.

    Raises:
        Exception: Any error from a processing step, or from the producer
            (re-raised after the queue has been drained).
    """
    keyword_map = get_translated_keywords()
    queue = asyncio.Queue()
    end_of_stream = object()  # unique marker; cannot collide with a real message

    async def _ingest():
        """Run the producer, then always signal end-of-stream to the consumer."""
        try:
            await fetch_twitter_data(keyword_map, queue)
        finally:
            # Without this marker the consumer would wait on queue.get()
            # forever once the producer has returned (or failed).
            await queue.put(end_of_stream)

    # Keep a reference to the task: the event loop holds only weak references,
    # so an unreferenced task may be garbage-collected mid-run and its
    # exception would never be observed.
    producer = asyncio.create_task(_ingest())

    try:
        while True:
            raw = await queue.get()
            try:
                if raw is end_of_stream:
                    break
                processed = preprocess_message(raw)
                if not is_bot(processed["text_translated"]):
                    processed["geo_location"] = detect_geo_from_language(processed["lang_detected"])
                    enriched = analyze_sentiment(processed)
                    enriched["topics"] = extract_topics(enriched["text_translated"])
                    store_to_db(enriched)
            finally:
                # Always balance get() with task_done(), even when a step raises.
                queue.task_done()
            await asyncio.sleep(0.5)
    finally:
        # If processing failed mid-stream, do not leave the producer running.
        if not producer.done():
            producer.cancel()

    # Surface ingestion errors instead of losing them inside the task.
    await producer