## Twitter Scraping

In [None]:
# Please note that the snscrape can't collect RETWEETS
# Avoid high frequency requests:
#  - For ONE keywords/users, you collect 100K tweets                     -> ONE request, it's fine
#  - For 1000 keywords/users, you collect 1 tweet from each keyword/user -> 1K requests, may cause IP blocking

# Please make sure the snscrape has been installed in the Python environment
# If you are using PythonAnywhere
#   Step 1: Open $Bash on your PythonAnywhere Dashboard
#   Step 2: Enter the following command: pip3.9 install --user git+https://github.com/JustAnotherArchivist/snscrape.git
# The Python version has to be greater than 3.8

import snscrape.modules.twitter as sntwitter
import pandas as pd

# Keyword search expression: sustainability-related terms joined with OR, followed by
# "AND shein". The literal is split across lines only for readability; the resulting
# string is unchanged.
# NOTE(review): how Twitter search groups OR vs. AND is not visible here — confirm the
# query means "(any of the terms) AND shein" and not "... OR (unsustainable AND shein)".
key_word = (
    "biodegradable OR carbon OR climate OR climateaction OR climatechange OR "
    "climatecrisis OR earthday OR ecologic OR environment OR environmental OR "
    "nature OR natureforall OR organic OR paper OR paperbag OR paperbased OR "
    "paperless OR planet OR planetary OR plastic OR preserve OR recyclable OR "
    "recycle OR recycling OR reusable OR reusing OR sustainability OR "
    "sustainable OR unsustainable AND shein"
)

# The brand name (shein) is part of the keyword expression above.

# Date window shared by both searches (start date, end date)
from_date, end_date = "2018-11-01", "2022-11-01"

# The maximum number of tweets
count = 30000

# Account whose tweets are collected -> Tweet search by user
user_name = "@SHEIN_Official"

# Result accumulators: one list for the keyword search, one for the user search
tweets_list_keyword, tweets_list_user = [], []

#### Scraping tweets from a specific keyword ####
# Search query = keyword expression plus the since:/until: date operators.
command_keyword = key_word + ' since:' + from_date + ' until:' + end_date
print("Scraping data for keyword:", key_word)
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(command_keyword).get_items()):
    # Check the limit BEFORE storing: the previous `i > count` test ran after the
    # append and therefore kept count + 2 tweets instead of at most `count`.
    if i >= count:
        break
    # For other available attributes: https://github.com/JustAnotherArchivist/snscrape/issues/115
    tweets_list_keyword.append(
        [tweet.date, tweet.id, tweet.rawContent, tweet.user.username, tweet.url, tweet.lang]
    )

# Create a dataframe from the tweets list above
tweets_df_keyword = pd.DataFrame(
    tweets_list_keyword,
    columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'url', 'language'],
)
# Store the timestamp as text with its last 6 characters removed
# (presumably the "+00:00" UTC offset — TODO confirm).
tweets_df_keyword['Datetime'] = tweets_df_keyword['Datetime'].astype(str).str[:-6]
tweets_df_keyword.to_csv("tweets_keywords_shein.csv", index=False)  # Export to a csv file
# Also export an Excel copy; comment this line out if only the csv file is needed
tweets_df_keyword.to_excel("tweets_keywords_shein.xlsx", index=False)
print("Scraped data have been exported to the csv file")


#### Scraping tweets from a specific user's account ####
# Search query = "from:<account>" plus the since:/until: date operators.
command_user = 'from:' + user_name + ' since:' + from_date + ' until:' + end_date
print("Scraping data for user:", user_name)  # we updated the user_name with the brand name
for i, tweet in enumerate(sntwitter.TwitterSearchScraper(command_user).get_items()):
    # Check the limit BEFORE storing: the previous `i > count` test ran after the
    # append and therefore kept count + 2 tweets instead of at most `count`.
    if i >= count:
        break
    tweets_list_user.append(
        [tweet.date, tweet.id, tweet.rawContent, tweet.user.username, tweet.url]
    )

# Create a dataframe from the tweets list above
tweets_df_user = pd.DataFrame(
    tweets_list_user,
    columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'url'],
)
# Trim the timestamp BEFORE any export so the csv and the Excel file carry the same
# 'Datetime' text, matching what the keyword search does (previously the csv was
# written first and kept the untrimmed value).
tweets_df_user['Datetime'] = tweets_df_user['Datetime'].astype(str).str[:-6]
# NOTE(review): the files are named "patagonia" although user_name is a SHEIN account;
# the name is kept because the word-frequency step reads 'tweets_users_patagonia.csv'.
tweets_df_user.to_csv("tweets_users_patagonia.csv", index=False)  # Export to a csv file
# Also export an Excel copy; comment this line out if only the csv file is needed
tweets_df_user.to_excel("tweets_users_patagonia.xlsx", index=False)
print("Scraped data have been exported to the csv file")

## Word Freq

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv
from nltk.corpus import stopwords
import re
import string
from collections import defaultdict
import nltk
# Fetch the NLTK stop-word corpus and load the English stop-word list
nltk.download('stopwords')
stop = stopwords.words('english')

inter1 = []           # raw tweet texts (3rd csv column)
sentences_all = []    # lower-cased, punctuation-free sentences
sentences_clean = []  # token lists with stop words and pure-digit tokens removed
sentences_unpun = []

dictionary1 = {}             # word -> frequency
d2_dict = defaultdict(dict)  # word -> {co-occurring word -> frequency}

# The csv is produced by pandas.to_csv, which writes UTF-8 by default, so the encoding
# is stated explicitly instead of relying on the platform default. newline='' is what
# the csv module expects so that quoted fields containing line breaks parse correctly.
with open('tweets_users_patagonia.csv', encoding='utf-8', newline='') as f:
    rows = csv.reader(f, delimiter=',')
    for row_number, row in enumerate(rows):
        # Blank or truncated lines have no text column to read
        if len(row) < 3:
            continue
        # Skip the header written by the scraping step ('Text' is its 3rd column name)
        # so the column title is not counted as a tweet.
        if row_number == 0 and row[2] == 'Text':
            continue
        inter1.append(row[2])

# Split each tweet into sentences, lower-case them and strip ASCII punctuation.
# The boundary pattern and the translation table are built once, outside the loops.
sentence_boundary = re.compile(r' *[\.\?!][\'"\)\]]* *')
drop_punctuation = str.maketrans('', '', string.punctuation)
for row in inter1:
    for s in sentence_boundary.split(row):
        sentences_all.append(s.lower().translate(drop_punctuation))

# Remove stop words and pure-digit tokens from every sentence.
# A set gives O(1) membership tests; `stop` itself is left untouched.
stop_words = set(stop)
for sentence in sentences_all:
    sentences_clean.append(
        [token for token in sentence.split()
         if token not in stop_words and not token.isdigit()]
    )


# Count, in a single sweep over the cleaned sentences:
#   - how often each word occurs, and
#   - how often each ordered pair of *different* words shares a sentence
#     (a repeated word contributes once per occurrence, as before).
word_totals = {}
pair_totals = {}
for tokens in sentences_clean:
    for current in tokens:
        word_totals[current] = word_totals.get(current, 0) + 1
        for other in tokens:
            if current == other:
                continue
            neighbours = pair_totals.setdefault(current, {})
            neighbours[other] = neighbours.get(other, 0) + 1

# Publish the totals into the shared tables. update() overwrites the entry of every
# word / pair seen here and leaves any other existing entry alone, which is what the
# earlier "set to 0, then increment" passes amounted to.
dictionary1.update(word_totals)
for current, neighbours in pair_totals.items():
    d2_dict[current].update(neighbours)
                

# Write "word,count" rows. The with-block closes (and therefore flushes) the file;
# previously the handle was never closed, so rows could stay buffered while the
# notebook kernel was alive. UTF-8 is explicit so non-ASCII tokens can be written
# regardless of the platform default encoding.
with open('word_freq.csv', 'w', newline='', encoding='utf-8') as freq_file:
    csv.writer(freq_file).writerows(dictionary1.items())

# Write "word,co-occurring word,count" rows
with open('word_pair_freq.csv', 'w', newline='', encoding='utf-8') as pair_file:
    pair_writer = csv.writer(pair_file)
    for key1, neighbours in d2_dict.items():
        for key2, value2 in neighbours.items():
            pair_writer.writerow([key1, key2, value2])

print ("Wrote to word_freq.csv")

## Find and Replace


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import csv

# Define the input file name
filename = "C:/Users/teghw/Desktop/Fall 2022/INSY 448 - Text and Social Media Analytics/final project/scraping from companies/zara/tweets_users_zara.csv"
# Define the output file name
output = "C:/Users/teghw/Desktop/Fall 2022/INSY 448 - Text and Social Media Analytics/final project/find and replace/by company/zara_company_new.csv"
# Define the find-and-replace key file: column 1 = replacement text, column 2 = text to find
keys_filename = "C:/Users/teghw/Desktop/Fall 2022/INSY 448 - Text and Social Media Analytics/final project/find and replace/keys.csv"

# Load the key file ONCE (it used to be reopened and reparsed for every tweet row).
# Each entry is (search text padded with one space on both sides, replacement), lower-cased.
# NOTE(review): the match consumes the surrounding spaces but the replacement is inserted
# without them, so neighbouring words get glued together unless keys.csv already carries
# the spaces in its first column — confirm against the key file.
replacements = []
with open(keys_filename, 'r', newline='') as keys_file:
    for key_row in csv.reader(keys_file, delimiter=','):
        if len(key_row) < 2:
            continue  # blank or incomplete key line
        replacements.append((" " + key_row[1].lower() + " ", key_row[0].lower()))

# The list used to store the rows after replacement
output_list = []
# NOTE(review): UTF-8 assumes the input was exported with pandas.to_csv (its default
# encoding), like the other tweets_*.csv files of this project — confirm.
with open(filename, 'r', encoding='utf-8', newline='') as csvFile:
    reader = csv.reader(csvFile, delimiter=',', quotechar='"')
    for row in reader:
        # The text to process is the 3rd element of each row; shorter rows are copied as-is.
        # With no keys loaded the text is left untouched (not even lower-cased), as before.
        if len(row) > 2 and replacements:
            text = row[2].lower()
            # Find and Replace in Bruteforce way: apply every key in file order
            for search, replacement in replacements:
                text = text.replace(search, replacement)
            row[2] = text
        output_list.append(row)

# Write the values in output list to the output file
# (the handle gets its own name so `output` keeps holding the path)
with open(output, 'w', encoding='utf-8', newline='') as output_file:
    writer = csv.writer(output_file, quoting=csv.QUOTE_ALL)
    writer.writerows(output_list)
print ("Wrote to x_company_new.csv")

## Sentiment 

In [None]:
#!/usr/bin/python3.0
# if required install twython: pip install twython
# the input filename is limit_post.csv (set in the pd.read_csv call near the end of this cell), change it as needed
# the output file is sentiment_data.xlsx, change it when running the script multiple times
import pandas as pd
import numpy as np
import nltk

# Download the "vader_lexicon" NLTK resource (presumably the lexicon used by the
# SentimentIntensityAnalyzer that get_sentiment creates — TODO confirm).
nltk.download("vader_lexicon")
def get_sentiment(rating_data, analyzer=None):
    """Add VADER sentiment scores for the 'Sentences' column of *rating_data*.

    Four columns are added (or overwritten) in place: 'sent_neg', 'sent_neu',
    'sent_pos' and 'sent_compound'. Scores are written by column name, so any
    other columns of the frame are left untouched and the row index may be
    anything (it does not have to be 0..n-1).

    Args:
        rating_data: pandas DataFrame with a 'Sentences' column of text.
        analyzer: optional object exposing ``polarity_scores(text)`` that returns
            a mapping with the keys 'neg', 'neu', 'pos' and 'compound'. When
            omitted, an NLTK ``SentimentIntensityAnalyzer`` is created.

    Returns:
        The same DataFrame object, with the four score columns filled in.
        Rows whose 'Sentences' value is not a string (e.g. an empty cell read
        as NaN) keep the sentinel value -10.0 in all four columns.

    Reference: https://github.com/cjhutto/vaderSentiment
    """
    if analyzer is None:
        # Imported lazily so nltk is only required when no analyzer is supplied
        from nltk.sentiment.vader import SentimentIntensityAnalyzer
        analyzer = SentimentIntensityAnalyzer()

    score_keys = ('neg', 'neu', 'pos', 'compound')
    not_scored = -10.0  # sentinel for rows that could not be scored
    collected = {key: [] for key in score_keys}

    for sentence in rating_data['Sentences']:
        if isinstance(sentence, str):
            # Drop every non-ASCII character before scoring
            ascii_text = sentence.encode('ascii', 'ignore').decode('ascii')
            scores = analyzer.polarity_scores(ascii_text)
        else:
            scores = {}
        for key in score_keys:
            collected[key].append(float(scores.get(key, not_scored)))

    # Assign whole columns by name (in the order neg, neu, pos, compound) rather than
    # writing cell by cell to fixed positions 1..4, which overwrote real data columns
    # whenever the frame had more than one column.
    for key in score_keys:
        rating_data['sent_' + key] = collected[key]
    return rating_data

# Load the posts, decoding the file as latin1
rating_data = pd.read_csv("limit_post.csv", encoding='latin1')

# Whatever the first column is called, expose it under the name get_sentiment reads
first_column = rating_data.columns[0]
rating_data = rating_data.rename(columns={first_column: "Sentences"})

# Score every row and export the result as an Excel sheet without the index column
sentiment_data = get_sentiment(rating_data)
sentiment_data.to_excel("sentiment_data.xlsx", index=False)
print (" Written to sentiment_data.xlsx")