**Twitter API connection**

In [1]:
import tweepy
from os import getenv
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()


def _require_env(name):
    '''Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Checking here reports
            the missing credential by name instead of handing None to tweepy.
    '''
    value = getenv(name)
    if not value:
        raise RuntimeError('Missing required environment variable: ' + name)
    return value


TWITTER_CONSUMER_API_KEY = _require_env('TWITTER_CONSUMER_API_KEY')
TWITTER_CONSUMER_API_SECRET = _require_env('TWITTER_CONSUMER_API_SECRET')
TWITTER_ACCESS_TOKEN = _require_env('TWITTER_ACCESS_TOKEN')
TWITTER_ACCESS_TOKEN_SECRET = _require_env('TWITTER_ACCESS_TOKEN_SECRET')

# Build the auth handler from the consumer pair, then attach the access-token pair.
TWITTER_AUTH = tweepy.OAuthHandler(TWITTER_CONSUMER_API_KEY, TWITTER_CONSUMER_API_SECRET)
TWITTER_AUTH.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)

# API client used by the later cells to read user timelines.
TWITTER = tweepy.API(TWITTER_AUTH)

**Tweet preprocessing**

In [2]:
import re

def clean_tweet(tweet):
    '''Normalise a raw tweet into a space-separated token string for modeling.

    The text is lower-cased and split on any whitespace. Each token is then
    handled independently:
      * every character other than a-z and 0-9 is removed;
      * a token whose cleaned form starts with "http" (a link) is dropped;
      * a token that cleans down to nothing contributes the decimal code
        point of each non-ASCII character it contains (so emojis become
        numbers); ASCII-only punctuation tokens are dropped.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: Cleaned tokens joined by single spaces; '' if nothing survives.
    '''
    # Zero-width joiner and variation selectors only modify a neighbouring
    # emoji, so they are not emitted as tokens of their own.
    emoji_modifiers = {0x200D, 0xFE0E, 0xFE0F}

    cleaned_tokens = []

    # Clean token by token so a token and its raw text can never get out of
    # step, whatever mix of spaces, tabs or newlines separates the words.
    for raw_token in tweet.lower().split():
        token = re.sub('[^a-z0-9]', '', raw_token)

        if token.startswith('http'):
            continue  # link

        if token:
            cleaned_tokens.append(token)
            continue

        # Nothing alphanumeric left: replace emojis with their number value.
        for char in raw_token:
            code_point = ord(char)
            if code_point > 127 and code_point not in emoji_modifiers:
                cleaned_tokens.append(str(code_point))

    return ' '.join(cleaned_tokens)

**Collect tweets and generate dataframe**

In [3]:
import pandas as pd

def generate_tweets(user1, user2):
    '''Fetch and clean the timelines of two accounts.

    Args:
        user1: Screen name of the first account.
        user2: Screen name of the second account.

    Returns:
        pandas.DataFrame: one row per fetched status, with the cleaned text
        in 'tweet' and the screen name it was requested for in 'user'.
        Rows for `user1` come before rows for `user2`.
    '''
    cleaned_texts, screen_names = [], []

    for screen_name in (user1, user2):
        timeline = tweepy.Cursor(
            TWITTER.user_timeline,
            screen_name=screen_name,
            tweet_mode="extended",
            count=200,
        )
        for status in timeline.items():
            cleaned_texts.append(clean_tweet(status.full_text))
            screen_names.append(screen_name)

    return pd.DataFrame({'tweet': cleaned_texts, 'user': screen_names})

# The two accounts whose tweets are downloaded and labelled.
user1, user2 = '@nasa', '@justinbieber'
df = generate_tweets(user1, user2)

**Split and vectorize data**

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# One seed for both the shuffle and the split, so that for a given `df`
# a fresh "Restart & Run All" reproduces the same train/test partition.
RANDOM_STATE = 42

# Shuffle into a new frame instead of rebinding `df`: re-running this cell
# then always starts from the same input and produces the same result.
df_shuffled = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

X = df_shuffled['tweet']
y = df_shuffled['user']

# Hold out 33% of the tweets for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_STATE)

# Fit the vocabulary on the training tweets only, then reuse it for the test tweets.
vectorizer = CountVectorizer()
train_vector = vectorizer.fit_transform(X_train)
test_vector = vectorizer.transform(X_test)

**Perform logistic regression**

In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (default settings) on the training vectors.
clr = LogisticRegression().fit(train_vector, y_train.values.ravel())

# Accuracy on the held-out test vectors.
scores = clr.score(test_vector, y_test)
print(scores)

0.9637577491654745


In [6]:
# Run one example tweet through the same cleaning and vectorising steps as the training data.
tweet = clean_tweet('This week at NASA, we celebrate @NASAPersevere')
tweet = vectorizer.transform(pd.DataFrame({'tweet': [tweet]})['tweet'])
clr.predict(tweet)

array(['@nasa'], dtype=object)