# NFT Authenticity and Prediction using Sentiment Analysis and Deep Learning

## Imports

In [None]:
import os
import json
import asyncio
import tqdm
import tqdm.asyncio
import nest_asyncio
import tweepy
import time
import calendar
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn import svm, metrics, preprocessing
from sklearn.model_selection import cross_val_score, train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from dotenv import load_dotenv
from pprint import pprint
from datetime import datetime
from dateutil.relativedelta import relativedelta
from aiohttp import ClientSession, TCPConnector
from matplotlib import pyplot
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
nest_asyncio.apply()
load_dotenv()

## Data Collection

Get Project List Data

In [None]:
# Project list; the 'slug' column drives every OpenSea request below.
df = pd.read_csv("collections_2.csv")
sample_slugs = df.loc[:, 'slug'].values

# Both dictionaries are keyed by collection slug and are filled in by the
# scraping coroutines defined below.
slug_date, slug_price = dict(), dict()

### OpenSea Metrics from Collection Slugs

Asynchronous Data Collection

In [None]:
async def fetchData(url, session, retry_delay=5):
    '''
    Fetch `url` through `session` and return the raw response body (bytes).

    A body is accepted when it decodes as UTF-8 JSON and the decoded value
    does not contain a 'detail' key. Any other outcome (unreadable body,
    invalid JSON, a 'detail' key) is retried after `retry_delay` seconds,
    with no upper bound on the number of attempts.

    Parameters
    ----------
    url : str
        Address to request.
    session : object
        Async session whose `get(url)` returns an async context manager
        yielding a response with an awaitable `read()`.
    retry_delay : float, optional
        Seconds to wait between attempts (default 5).

    Returns
    -------
    bytes
        The first body that passes validation.
    '''
    while True:
        async with session.get(url) as response:
            try:
                body = await response.read()
                # Explicit check instead of `assert`, which is stripped under -O.
                if 'detail' not in json.loads(body.decode('UTF-8')):
                    return body
            except Exception:
                # Deliberate best-effort: any read/decode/parse failure just
                # triggers another attempt. `Exception` (not a bare except)
                # lets task cancellation and KeyboardInterrupt propagate.
                pass
        # Sleep only after the response context has been exited, so the
        # failed response is not held open while waiting.
        await asyncio.sleep(retry_delay)

async def transactionScrape():
    '''
    API KEY REQUIRED
    ---------------------
    Fetch the successful-sale events for every slug in `sample_slugs` and
    accumulate them into `slug_price`.

    For each slug, the date part of `slug_date[slug]` is taken as day 0 and
    one request is issued per day for 14 consecutive one-day windows, all
    through `fetchData`. Each returned event adds its `total_price`
    (divided by 10**18, i.e. wei to ETH) to `slug_price[slug]['total_volume']`
    and increments `slug_price[slug]['num_transactions']`.

    `collectionScrape()` must have run first: it populates `slug_date` and
    creates the `slug_price` entries that are updated here.

    NOTE(review): each request asks for `limit=300` and no further pages are
    fetched, so a day with more events than one page is under-counted.
    NOTE(review): the window bounds are naive datetimes, so `.timestamp()`
    interprets them in the local timezone; `created_date` is presumably UTC -
    confirm and make the datetimes timezone-aware if so.
    '''
    wei_per_eth = 10 ** 18
    url = "https://api.opensea.io/api/v1/events?collection_slug={slug}&only_opensea=false&event_type=successful&limit=300&occurred_after={start}&occurred_before={end}"
    headers = {"Accept": "application/json", "X-API-KEY": os.getenv('OPENSEA_API_KEY')}
    # One connection per host keeps the requests to the API serialized.
    connector = TCPConnector(limit_per_host=1)
    tasks = []
    responses = []
    async with ClientSession(connector=connector, headers=headers) as session:
        for slug in sample_slugs:
            launch_day = datetime.fromisoformat(slug_date[slug].split('T')[0])
            for offset in range(14):
                start_date = launch_day + relativedelta(days=offset)
                end_date = start_date + relativedelta(days=1)
                request_url = url.format(slug=slug, start=start_date.timestamp(), end=end_date.timestamp())
                tasks.append(asyncio.ensure_future(fetchData(request_url, session)))
        for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            responses.append(await f)

    for raw in responses:
        payload = json.loads(raw.decode('utf8'))
        try:
            events = payload['asset_events']
        except (KeyError, TypeError):
            # Not an events payload; nothing to accumulate.
            continue
        if not events:
            # No sales in this one-day window.
            continue
        collection_slug = events[0].get('collection_slug')
        totals = slug_price.get(collection_slug)
        if totals is None:
            print(f'Skipping events for unknown collection slug: {collection_slug}')
            continue
        for event in events:
            try:
                price_wei = float(event['total_price'])
            except (KeyError, TypeError, ValueError):
                # Skip only the malformed event, not the rest of the response.
                continue
            totals['total_volume'] += price_wei / wei_per_eth
            totals['num_transactions'] += 1

async def collectionScrape():
    '''
    Request the collection record for every slug in `sample_slugs`
    concurrently (via `fetchData`) and record what the later steps need:

    * `slug_date[slug]`  - the collection's `created_date`, used as the
      starting point for transaction scraping;
    * `slug_price[slug]` - a fresh accumulator with zero volume and zero
      transactions.

    A one-line summary (name, floor price, total volume, creation date) is
    printed per collection. No API key header is sent for this endpoint.
    '''
    endpoint = "https://api.opensea.io/api/v1/collection/{}"
    async with ClientSession(connector=TCPConnector()) as session:
        pending = [
            asyncio.ensure_future(fetchData(endpoint.format(slug), session))
            for slug in sample_slugs
        ]
        # Collect bodies in completion order while tqdm reports progress.
        bodies = [
            await finished
            for finished in tqdm.tqdm(asyncio.as_completed(pending), total=len(pending))
        ]

    for body in bodies:
        collection = json.loads(body.decode('utf8'))['collection']
        name = collection['name']
        floor = collection['stats']['floor_price']
        volume = collection['stats']['total_volume']
        date_created = collection['created_date']
        slug = collection['slug']
        slug_date[slug] = date_created
        slug_price[slug] = {'total_volume' : 0, 'num_transactions': 0 }
        print(f'{name}: Floor: {floor}ETH --- Total Volume: {volume}ETH --- Created {date_created}')


Run Collection Script - fills the slug_date dictionary and initializes slug_price

In [None]:
# Schedule collectionScrape() on the current event loop and block until it
# finishes; afterwards slug_date and slug_price hold one entry per collection.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(collectionScrape())
loop.run_until_complete(future)

Run Transaction Collections - outputs to slug_price dictionary

In [None]:
# Schedule transactionScrape() on the current event loop and block until it
# finishes. It reads slug_date and updates slug_price, so the collection
# script must have been run first.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(transactionScrape())
loop.run_until_complete(future)

Feature Extraction and Writing to Features File

In [None]:
def _average_sale(slug):
    """Return total volume / transaction count for `slug`, or 0 with no sales."""
    stats = slug_price[slug]
    count = stats['num_transactions']
    # A collection with no recorded sales would otherwise divide by zero.
    return stats['total_volume'] / count if count else 0


# Per-collection price features derived from the totals in slug_price.
df['volume'] = df['slug'].apply(lambda x: slug_price[x]['total_volume'])
df['transaction_count'] = df['slug'].apply(lambda x: slug_price[x]['num_transactions'])
df['average_sale'] = df['slug'].apply(_average_sale)
# Rows are appended (no header row is written) to the existing features file.
df.to_csv('features.csv', mode='a', index=True, header=False)

### Twitter Historical Tweet Collection

Get Authorization Credentials

In [None]:
# Twitter credentials are read from the environment; a variable that is not
# set yields None.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.getenv(variable)
    for variable in ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
)

Configure Tweepy

In [None]:
# Build the Tweepy client from the consumer key pair plus the access token
# pair read from the environment above.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# `api` is the client used by get_tweets() for the archive search.
api = tweepy.API(auth)

Read in the Projects List

In [None]:
# Only the "name" column is kept; each name is later used both as the search
# query and as the basis of the per-project CSV filename.
collections = pd.read_csv("collections.csv")["name"]
pprint(collections)

Tweet Collection Script

In [1]:
def get_tweets(proj):
    '''
    Search the full tweet archive for `proj` and save the results as CSV.

    The output path is "./data/<name>.csv", where <name> is `proj` with
    spaces replaced by underscores. The file has two columns, "tweets"
    (status text) and "timestamps" (status creation time). The output
    directory is created if it does not exist yet.

    Uses the module-level `api` client with the "prod" environment label.
    '''
    filename = "./data/" + "_".join(proj.split(" ")) + ".csv"
    print("Collecting tweets for:", proj)
    print("Writing results to", filename)
    resp = api.search_full_archive("prod", proj)
    tweets = [status.text for status in resp]
    timestamps = [status.created_at for status in resp]
    df = pd.DataFrame({
            "tweets": pd.Series(tweets),
            "timestamps": pd.Series(timestamps)
            })
    # Without this, to_csv raises when ./data has not been created yet.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    df.to_csv(filename)

Run Tweet Collection Script

In [None]:
# One archive search and one CSV file per project name.
for proj in collections:
    get_tweets(proj)

## Use sentiment analysis to encode tweets

In [None]:
def get_sentiment(tweets, analyzer=None):
    '''
    Return the average VADER sentiment for a collection of tweets.

    Parameters
    ----------
    tweets : iterable
        Tweet texts. Entries that are not strings (for example NaN from an
        empty CSV cell) are skipped.
    analyzer : object, optional
        Any object with a `polarity_scores(text)` method returning a dict
        with 'neg', 'neu' and 'pos' keys. Defaults to a new
        SentimentIntensityAnalyzer; pass one in to reuse it across calls.

    Returns
    -------
    numpy.ndarray
        Shape (3,): mean negative, neutral and positive score. All zeros
        when there is no tweet to score.
    '''
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
    # Start from an empty list: seeding it with [0, 0, 0] mixed scalars with
    # 3-element rows, which makes the array ragged and skews the mean.
    all_scores = []
    for tweet in tweets:
        if not isinstance(tweet, str):
            continue
        score = analyzer.polarity_scores(tweet)
        all_scores.append([score['neg'], score['neu'], score['pos']])
    if not all_scores:
        return np.zeros(3)
    return np.array(all_scores).mean(axis=0)

In [None]:
def get_volume_metric(timestamps):
    '''
    Return the standard deviation of a series of tweet timestamps, used as
    a stand-in metric for tweet volume.

    Each timestamp must be a string of the form "YYYY-MM-DD HH:MM:SS+00:00";
    it is converted to seconds since the epoch before the deviation is taken.

    This is an imperfect metric because it relies heavily on Twitter's API:
    without a premium license the available data was limited.
    '''
    stamp_format = "%Y-%m-%d %H:%M:%S+00:00"
    epoch_seconds = pd.Series(
        [calendar.timegm(time.strptime(stamp, stamp_format)) for stamp in timestamps]
    )
    return epoch_seconds.std()

### Add sentiment encodings to feature dataframe

In [None]:
# Add four tweet-derived columns to features.csv: the mean negative, neutral
# and positive sentiment, and the timestamp deviation used as tweet volume.
features = pd.read_csv("features.csv")
negative_scores = []
neutral_scores = []
positive_scores = []
volume = []

for proj in features["name"]:
    # Same naming scheme that get_tweets() uses when writing the files.
    filename = "./data/" + "_".join(proj.split(" ")) + ".csv"
    tweet_data = pd.read_csv(filename)
    print("Analyzing tweets for", proj)
    scores = np.ravel(get_sentiment(tweet_data["tweets"]))
    timestamp_deviation = get_volume_metric(tweet_data["timestamps"])
    # Take all three scores or none, so the three lists can never end up
    # with different lengths.
    if scores.size == 3:
        negative, neutral, positive = scores
    else:
        negative, neutral, positive = 0, 0, 0
    negative_scores.append(negative)
    neutral_scores.append(neutral)
    positive_scores.append(positive)
    # NaN (fewer than two timestamps) is truthy, so test for it explicitly
    # to make the zero fallback actually apply.
    if pd.isna(timestamp_deviation):
        volume.append(0)
    else:
        volume.append(timestamp_deviation)

features["negative_score"] = pd.Series(negative_scores)
features["neutral_score"] = pd.Series(neutral_scores)
features["positive_score"] = pd.Series(positive_scores)
features["tweet_volume"] = pd.Series(volume)

# NOTE(review): the index is written as an extra unnamed column on every
# run; pass index=False if that column is not wanted.
features.to_csv("features.csv")

In [None]:
# Preview the first rows, including the new sentiment and tweet-volume columns.
features.head()

## Classification Models

### Gaussian Naive Bayes

In [None]:
def predictNB(x_train, x_test, y_train, y_test):
    '''
    Fit a Gaussian Naive Bayes classifier on the training data and return
    its accuracy on the test data.
    '''
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    return metrics.accuracy_score(y_test, predictions)

### Support Vector Machine (SVM)

In [None]:
def predictSVM(x_train, x_test, y_train, y_test):
    '''
    Fit a support vector classifier (default settings) on the training data
    and return its accuracy on the test data.
    '''
    predictions = svm.SVC().fit(x_train, y_train).predict(x_test)
    return metrics.accuracy_score(y_test, predictions)

### Multilayer Perceptron (MLP)

In [None]:
def predictMLP(x_train, x_test, y_train, y_test, epochs=1000, batch_size=8):
    '''
    Train a small multilayer perceptron for binary classification, plot its
    training loss curve, and return its accuracy on the test data.

    The network has four ReLU hidden layers (20, 20, 20 and 5 units), each
    followed by 50% dropout, and a single sigmoid output unit. It is trained
    with Adam on binary cross-entropy.

    Parameters
    ----------
    x_train, x_test : array-like of shape (n_samples, n_features)
    y_train, y_test : array-like of binary labels
    epochs : int, optional
        Number of training epochs (default 1000).
    batch_size : int, optional
        Mini-batch size (default 8).

    Returns
    -------
    float
        Accuracy reported by `model.evaluate` on the test data.
    '''
    n_features = x_train.shape[1]
    model = Sequential()
    for position, units in enumerate((20, 20, 20, 5)):
        # Only the first layer needs to be told the input width.
        extra = {'input_shape': (n_features,)} if position == 0 else {}
        model.add(Dense(units, activation='relu', kernel_initializer='he_normal', **extra))
        model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    pyplot.title('Learning Curve')
    pyplot.xlabel('Epoch')
    pyplot.ylabel('Binary Cross Entropy')
    pyplot.plot(history.history['loss'], label='train')
    pyplot.legend()
    pyplot.show()
    _, acc = model.evaluate(x_test, y_test, verbose=0)
    return acc


## Data Preprocessing

We will create different training sets using text features only, price features only, and a combination of price and text features.

### Extract Features and Labels

In [None]:
# Column groups for the three feature sets compared below.
price_columns = ['transaction_count', 'volume', 'average_sale']
text_columns = ['negative_score', 'neutral_score', 'positive_score', 'tweet_volume']

# Missing values are replaced with 0 before any features are selected.
dataset = pd.read_csv("features.csv").fillna(0)

x_data_price = dataset[price_columns]
x_data_text = dataset[text_columns]
x_data_combined = dataset[price_columns + text_columns]

y_data = dataset['label']

### Split Data into Testing and Training

In [None]:
# A fixed seed makes the 80/20 split reproducible. Because all three frames
# come from the same dataset, using the same seed also gives them the same
# train/test rows, so the feature sets are compared on identical samples.
SPLIT_SEED = 42

x_train_price, x_test_price, y_train_price, y_test_price = train_test_split(
    x_data_price, y_data, test_size=0.2, random_state=SPLIT_SEED)

x_train_text, x_test_text, y_train_text, y_test_text = train_test_split(
    x_data_text, y_data, test_size=0.2, random_state=SPLIT_SEED)

x_train_combined, x_test_combined, y_train_combined, y_test_combined = train_test_split(
    x_data_combined, y_data, test_size=0.2, random_state=SPLIT_SEED)

### Standardize The Features

In [None]:
# Each scaler is fitted on the training split only and then applied to both
# splits. Fitting a second scaler on the test split would put the test data
# on a different scale from the one the model was trained on.
price_scaler = preprocessing.StandardScaler().fit(x_train_price)
x_train_scaled_price = price_scaler.transform(x_train_price)
x_test_scaled_price = price_scaler.transform(x_test_price)

text_scaler = preprocessing.StandardScaler().fit(x_train_text)
x_train_scaled_text = text_scaler.transform(x_train_text)
x_test_scaled_text = text_scaler.transform(x_test_text)

combined_scaler = preprocessing.StandardScaler().fit(x_train_combined)
x_train_scaled_combined = combined_scaler.transform(x_train_combined)
x_test_scaled_combined = combined_scaler.transform(x_test_combined)

### Preview the data

In [None]:
# First rows of the unscaled combined (price + text) training features.
x_train_combined.head()

In [None]:
# Summary statistics of the unscaled text-only training features.
x_train_text.describe()

## Running the Models

## Text Data Only

### Naive Bayes Prediction

In [None]:
# Test accuracy of Gaussian Naive Bayes on the standardized text features.
predictNB(x_train_scaled_text, x_test_scaled_text, y_train_text, y_test_text)

### SVM Model Prediction

In [None]:
# Test accuracy of the SVM on the standardized text features.
predictSVM(x_train_scaled_text, x_test_scaled_text, y_train_text, y_test_text)

### MLP Model Prediction

In [None]:
# Trains the MLP on the standardized text features, shows its loss curve,
# and returns its test accuracy.
predictMLP(x_train_scaled_text, x_test_scaled_text, y_train_text, y_test_text)

## Price Data Only

### Naive Bayes Prediction

In [None]:
# Test accuracy of Gaussian Naive Bayes on the standardized price features.
predictNB(x_train_scaled_price, x_test_scaled_price, y_train_price, y_test_price)

### SVM Model Prediction

In [None]:
# Test accuracy of the SVM on the standardized price features.
predictSVM(x_train_scaled_price, x_test_scaled_price, y_train_price, y_test_price)

### MLP Model Prediction

In [None]:
# Trains the MLP on the standardized price features, shows its loss curve,
# and returns its test accuracy.
predictMLP(x_train_scaled_price, x_test_scaled_price, y_train_price, y_test_price)

## Price and Text Data

### Naive Bayes Prediction

In [None]:
# Test accuracy of Gaussian Naive Bayes on the standardized price + text features.
predictNB(x_train_scaled_combined, x_test_scaled_combined, y_train_combined, y_test_combined)

### SVM Model Prediction

In [None]:
# Test accuracy of the SVM on the standardized price + text features.
predictSVM(x_train_scaled_combined, x_test_scaled_combined, y_train_combined, y_test_combined)

### MLP Model Prediction

In [None]:
# Trains the MLP on the standardized price + text features, shows its loss
# curve, and returns its test accuracy.
predictMLP(x_train_scaled_combined, x_test_scaled_combined, y_train_combined, y_test_combined)