# Scraping news from Google News

In [None]:
import feedparser
import datetime
import urllib.parse
import pandas as pd

# Specify the keywords and the number of years to scrape
keywords = ["amazon", "aws"]
num_years = 5

# Get the current date and the date `num_years` ago
now = datetime.datetime.now()
try:
    start_date = now.replace(year=now.year - num_years)
except ValueError:
    # Feb 29 has no counterpart in a non-leap target year; fall back to Feb 28.
    start_date = now.replace(year=now.year - num_years, day=28)

# Create empty lists to store the news headlines, dates, and URLs
headlines = []
dates = []
urls = []

# Loop through the dates and scrape the news for each date
one_day = datetime.timedelta(days=1)
current_date = start_date
while current_date <= now:
    day = current_date.date()
    next_day = day + one_day

    # Loop through the keywords and scrape the news for each keyword
    for keyword in keywords:
        # Restrict the search to a single day with absolute ISO dates.
        # The `when:` operator takes a relative window (e.g. `when:7d`), so the
        # previous `when:mm-dd-yy` query did not filter by date.
        # NOTE(review): confirm whether `after:`/`before:` bounds are inclusive
        # and widen the window by a day if articles on `day` are being missed.
        query = f"{keyword} after:{day.isoformat()} before:{next_day.isoformat()}"
        query = urllib.parse.quote_plus(query)
        url = f"https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en"

        # Parse the RSS feed and extract the news headlines, dates, and URLs.
        # The recorded date is the day that was queried, not the entry's own
        # publication timestamp.
        feed = feedparser.parse(url)
        for entry in feed.entries:
            headlines.append(entry.title)
            dates.append(day)
            urls.append(entry.link)

    # Move to the next date
    current_date += one_day

# Create a dataframe with the news headlines, dates, and URLs
df = pd.DataFrame({"date": dates, "headline": headlines, "url": urls})

# Save without the positional index so reloading the file does not grow an
# extra "Unnamed: 0" column.
df.to_csv('google_news.csv', index=False)



In [6]:
import pandas as pd
# Read the saved Google News scrape back from disk and preview its first five rows
google_news = pd.read_csv("google_news.csv")
google_news.head(5)

Unnamed: 0,Date,headline,url
0,01-01-2017,Forensic Amazon Analysis: A Value Equation App...,https://news.google.com/rss/articles/CBMiW2h0d...
1,01-01-2017,I bought Bitcoin from PayPal. Here's what happ...,https://news.google.com/rss/articles/CBMiWmh0d...
2,01-01-2017,At least 9 dead in Ugandan New Year firework c...,https://news.google.com/rss/articles/CBMiTWh0d...
3,01-01-2017,Tolino Epos 2 e-Reader Review - Good e-Reader,https://news.google.com/rss/articles/CBMiQmh0d...
4,01-01-2017,Which topics would you like to discuss with us...,https://news.google.com/rss/articles/CBMiXWh0d...


# Scraping news from The New York Times

In [None]:
import datetime as dt
import getpass
import os
import time

import pandas as pd
import requests

# Article Search endpoint; query parameters are passed via `params=` so that
# requests URL-encodes them (search terms may contain spaces or `&`).
ARTICLE_SEARCH_URL = "https://api.nytimes.com/svc/search/v2/articlesearch.json"

# Read the API key from the environment (or prompt for it) instead of storing it
# in the notebook.
# NOTE(review): a key was previously hardcoded in this cell; it should be rotated.
api_key = os.environ.get("NYT_API_KEY") or getpass.getpass("Enter NYT API key: ")

# Prompt for start and end dates
start_date_str = input("Enter start date (YYYY-MM-DD): ")
end_date_str = input("Enter end date (YYYY-MM-DD): ")
start_date = dt.datetime.strptime(start_date_str, "%Y-%m-%d").date()
end_date = dt.datetime.strptime(end_date_str, "%Y-%m-%d").date()

# Ask for the keywords to search; an empty entry ends the list
company_names = []
while True:
    company_name = input("Enter search query (or press enter to stop): ")
    if company_name == '':
        break
    company_names.append(company_name)

# Upper bound on pages fetched per keyword; the loop stops earlier on the first
# empty page or error response.
# NOTE(review): the Article Search docs cap `page` at 100 - confirm.
num_pages = 100

# Pause between requests so the per-minute rate limit is not hit (a non-200
# response would otherwise silently truncate the results).
# NOTE(review): 12 s matches the NYT developer FAQ guidance - confirm current limit.
request_delay_seconds = 12

# Collect articles for ALL keywords in one list so that every keyword ends up in
# the saved file (previously each keyword overwrote the previous one's CSV).
result = []
for company_name in company_names:
    print(f'Searching for "{company_name}" on New York Times...')

    for page in range(num_pages):
        params = {
            "api-key": api_key,
            "q": company_name,
            # The API expects dates as YYYYMMDD, not ISO YYYY-MM-DD.
            "begin_date": start_date.strftime("%Y%m%d"),
            "end_date": end_date.strftime("%Y%m%d"),
            "page": page,
        }
        try:
            response = requests.get(ARTICLE_SEARCH_URL, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error occurred: {exc}")
            break
        if response.status_code != 200:
            print(f"Error occurred: {response.text}")
            break

        # Extract the relevant fields from the current page
        articles = response.json()["response"]["docs"]
        if not articles:
            # No more results for this keyword
            break
        for article in articles:
            result.append(
                {
                    "title": article["headline"]["main"],
                    "date": article["pub_date"],
                    "url": article["web_url"],
                    "content": article.get("abstract", ""),
                }
            )

        time.sleep(request_delay_seconds)

# Convert the combined results to a DataFrame and save them under the file name
# that the next cell reads back ('nytimes_news.csv').
df = pd.DataFrame(result)
df.to_csv("nytimes_news.csv", index=False)


In [5]:
# Read the saved New York Times results back from disk and preview the first five rows
nytimes_news = pd.read_csv("nytimes_news.csv")
nytimes_news.head(5)

Unnamed: 0,date,title,url,content
0,2012-01-01 11:24:45+00:00,Unboxed Extra: I.B.M. and America’s Job Challenge,https://bits.blogs.nytimes.com/2012/01/01/unbo...,I.B.M. is a corporate pioneer in globalization...
1,2012-01-02 23:58:32+00:00,"On Wall Street, Renewed Optimism for Deal-Making",https://dealbook.nytimes.com/2012/01/02/on-wal...,"According to a recent study by Ernst & Young, ..."
2,2012-01-06 18:39:50+00:00,A Historical Cycle Bodes Ill for the Markets,https://www.nytimes.com/2012/01/07/business/ec...,In what appears to be a recurring 15-year cycl...
3,2012-01-09 12:00:31+00:00,This Week in Small Business: For the Win!,https://boss.blogs.nytimes.com/2012/01/09/this...,Plus: Do your employees offer dissenting viewp...
4,2012-01-10 02:19:27+00:00,UniCredit’s Weak Share Offering a Poor Omen in...,https://dealbook.nytimes.com/2012/01/09/unicre...,Even a steep discount drew tepid interest in U...


# merging both news datasets

In [None]:
import pandas as pd

# NOTE(review): 'merged_file.csv' and '2022.csv' are not written by any cell
# visible in this notebook - confirm where they come from.

# First input: parse the "date" column as datetimes and use it as the index
df1 = pd.read_csv('merged_file.csv')
df1 = df1.assign(date=pd.to_datetime(df1['date'])).set_index('date')

# Second input: same treatment
df2 = pd.read_csv('2022.csv')
df2 = df2.assign(date=pd.to_datetime(df2['date'])).set_index('date')

# Stack the two frames row-wise and order the result chronologically by its index
merged_df = pd.concat([df1, df2]).sort_index()

# Write the combined frame (date index included) to a new CSV file
merged_df.to_csv('google and nytimes news.csv')


In [9]:
# Read the merged news file back from disk and preview the first five rows
google_and_nytimes_news = pd.read_csv("google and nytimes news.csv")
google_and_nytimes_news.head(5)

Unnamed: 0,Date,title
0,01-01-2012,Unboxed Extra: I.B.M. and Americaâ€™s Job Chal...
1,02-01-2012,"On Wall Street, Renewed Optimism for Deal-Making"
2,06-01-2012,A Historical Cycle Bodes Ill for the Markets
3,09-01-2012,This Week in Small Business: For the Win!
4,10-01-2012,UniCreditâ€™s Weak Share Offering a Poor Omen ...


# performing sentiment analysis on the news data


In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

# Create the VADER analyzer that analyze_sentiment() uses
analyzer = SentimentIntensityAnalyzer()

# Read the headlines to be scored
df = pd.read_csv("2022.csv")

# Score one headline with VADER (polarity) and TextBlob (subjectivity)
def analyze_sentiment(title):
    """Return sentiment scores for a single headline.

    Uses the module-level ``analyzer`` for the polarity scores and TextBlob for
    the subjectivity score.

    Parameters
    ----------
    title : the headline text to score.

    Returns
    -------
    pandas.Series with the entries "compound", "subjectivity", "positive",
    "negative" and "neutral", in that order.
    """
    vader_scores = analyzer.polarity_scores(title)
    blob_subjectivity = TextBlob(title).sentiment.subjectivity

    return pd.Series(
        {
            "compound": vader_scores["compound"],
            "subjectivity": blob_subjectivity,
            "positive": vader_scores["pos"],
            "negative": vader_scores["neg"],
            "neutral": vader_scores["neu"],
        }
    )

# Apply the sentiment analysis function to the title column; each returned
# Series becomes one row of sentiment_df
sentiment_df = df["title"].apply(analyze_sentiment)

# Concatenate the original dataframe and the sentiment dataframe column-wise
df = pd.concat([df, sentiment_df], axis=1)

# Save the scored headlines (was `to_cav`, which raises AttributeError).
# The positional index is left out so reloading does not add an "Unnamed: 0" column.
df.to_csv('news sentiment data.csv', index=False)


In [11]:
# Read the scored headlines back from disk and preview the first five rows
news_sentiment_data = pd.read_csv("news sentiment data.csv")
news_sentiment_data.head(5)

Unnamed: 0,Date,title,compound,positive,negative,neutral,subjectivity
0,01-01-2012,Unboxed Extra: I.B.M. and Americaâ€™s Job Chal...,0.9052,0.172,0.0,0.828,0.1
1,02-01-2012,"On Wall Street, Renewed Optimism for Deal-Making",0.0,0.0,0.0,1.0,0.0
2,06-01-2012,A Historical Cycle Bodes Ill for the Markets,0.0,0.0,0.0,1.0,0.5
3,09-01-2012,This Week in Small Business: For the Win!,0.0,0.0,0.0,1.0,0.4
4,10-01-2012,UniCreditâ€™s Weak Share Offering a Poor Omen ...,0.1779,0.11,0.084,0.806,0.6125


# Getting the stock data from Yahoo Finance

In [None]:
import yfinance as yf

# Ticker symbol and date range to download
ticker = "AMZN"
start_date = "2012-01-31"
end_date = "2022-12-31"  # NOTE(review): yfinance treats `end` as exclusive - confirm

# Download the price history
data = yf.download(ticker, start=start_date, end=end_date)

# Save the raw download
data.to_csv("AMZN.csv")

# Preview the data; this must be the cell's last expression to be displayed
# (the earlier mid-cell `data.head()` was evaluated and discarded).
data.head()

# merging the stock dataset and the news dataset

In [None]:
# Merge the news-sentiment workbook with the stock-price workbook on 'Date'
import os
from pathlib import Path

import pandas as pd

# Folder holding the two Excel inputs. Override with the STOCK_NEWS_DATA_DIR
# environment variable on other machines; the default keeps the original location.
DATA_DIR = Path(os.environ.get("STOCK_NEWS_DATA_DIR", "C:/Users/vaibhav semwal/Desktop"))

# Load the two Excel files into pandas dataframes
df1 = pd.read_excel(DATA_DIR / "merged1.xlsx")
df2 = pd.read_excel(DATA_DIR / "NDX (2).xlsx")

# Convert the 'Date' column in both dataframes to datetimes so the join keys
# compare equal
df1['Date'] = pd.to_datetime(df1['Date'], format='%d-%m-%Y')
df2['Date'] = pd.to_datetime(df2['Date'], format='%d-%m-%Y')

# Left join: keep every row of df1 and attach the df2 columns with a matching 'Date'
merged_df = pd.merge(df1, df2, on='Date', how='left')

# Save the merged dataframe to a new CSV file
merged_df.to_csv('sentiments and stock.csv', index=False)

In [13]:
# Read the merged sentiment + stock file back from disk and preview the first five rows
sentiments_and_stock = pd.read_csv("sentiments and stock.csv")
sentiments_and_stock.head(5)

Unnamed: 0.1,Unnamed: 0,Date,title,compound,positive,negative,neutral,subjectivity,Open,High,Low,Close,Adj Close,Volume
0,0,2012-01-06,A Historical Cycle Bodes Ill for the Markets,0.0,0.0,0.0,1.0,0.5,8.9035,9.2325,8.875,9.1305,9.1305,140168000
1,1,2012-01-09,This Week in Small Business: For the Win!,0.0,0.0,0.0,1.0,0.4,9.138,9.2185,8.85,8.928,8.928,101138000
2,2,2012-01-10,UniCreditÃ¢â‚¬â„¢s Weak Share Offering a Poor ...,0.1779,0.11,0.084,0.806,0.6125,9.055,9.12,8.855,8.967,8.967,79716000
3,3,2012-01-11,"Stanley Kwan, Hang Seng Index Creator, Dies at 86",0.0,0.0,0.0,1.0,0.0,8.982,9.0385,8.9095,8.945,8.945,62054000
4,4,2012-01-17,European Central Bankers Criticize Role of Rat...,-0.1027,0.0,0.053,0.947,0.125,9.0075,9.165,8.9255,9.083,9.083,112890000


# Performing exploratory data analysis (EDA)

In [None]:
import pandas as pd
import pandas_profiling as pp

# Load the dataset (file name fixed: was "final datasetcsv", missing the dot;
# the modelling cells read "final dataset.csv")
df = pd.read_csv("final dataset.csv")

# Generate a report with statistics about each column
report = pp.ProfileReport(df)

# Render the report inline in the notebook; print(report) does not render the
# HTML profile.
report.to_notebook_iframe()

# Applying Linear regression

In [None]:
# Import necessary libraries.
# numpy and pandas are imported here because this cell uses `np` and `pd`;
# relying on imports from other cells breaks on Restart & Run All.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load dataset into a pandas DataFrame
df = pd.read_csv("final dataset.csv")

# Replace infinite values with NaN and fill with column means.
# numeric_only=True restricts the mean to numeric columns; on recent pandas
# versions df.mean() raises on text columns.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.mean(numeric_only=True))

# Split the data into training and testing sets; both targets are passed to a
# single call so they share the same row split
X = df[['compound', 'subjectivity', 'Open']]
y_high = df['High']
y_low = df['Low']
X_train, X_test, y_high_train, y_high_test, y_low_train, y_low_test = train_test_split(
    X, y_high, y_low, test_size=0.2, random_state=42
)

# Train a linear regression model to predict high column
lr_high = LinearRegression()
lr_high.fit(X_train, y_high_train)

# Train a linear regression model to predict low column
lr_low = LinearRegression()
lr_low.fit(X_train, y_low_train)

# Predict the high and low columns on the testing set
y_high_pred = lr_high.predict(X_test)
y_low_pred = lr_low.predict(X_test)

# Calculate the root mean squared error for high and low predictions
rmse_high = np.sqrt(mean_squared_error(y_high_test, y_high_pred))
rmse_low = np.sqrt(mean_squared_error(y_low_test, y_low_pred))

print(f"RMSE for High column: {rmse_high}")
print(f"RMSE for Low column: {rmse_low}")


In [None]:
# One hand-entered observation with the feature columns used to fit the regressors
new_data = pd.DataFrame(
    [{'compound': -0.5251, 'subjectivity': 0.387777778, 'Open': 98.7}]
)

# Run the sample through the High model and the Low model
new_high_pred, new_low_pred = (
    regressor.predict(new_data) for regressor in (lr_high, lr_low)
)

print(f"Predicted High: {new_high_pred[0]}")
print(f"Predicted Low: {new_low_pred[0]}")


# Applying LSTM

In [None]:

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


# Define SMAPE metric for evaluation
def smape_kun(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes mean(200 * |y_pred - y_true| / (|y_pred| + |y_true|)).

    Parameters
    ----------
    y_true, y_pred : array-likes of matching (or broadcastable) shape.

    Returns
    -------
    float : the mean SMAPE over all elements.

    Elements where both values are zero contribute 0 (a perfect prediction)
    instead of 0/0 = NaN, which would otherwise turn the whole mean into NaN.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true) * 200
    denominator = np.abs(y_pred) + np.abs(y_true)

    # Divide only where the denominator is non-zero; the rest stay at 0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio)

# Feature columns fed to the network, plus the 'Close' target kept as the last column
exog_vars = ['subjectivity','compound', 'Open','High','Adj Close']
model_columns = [*exog_vars, 'Close']

# Read the CSV with 'Date' parsed as the index, keep only the model columns,
# and discard rows that have any missing value
data = (
    pd.read_csv('final dataset.csv', index_col='Date', parse_dates=True)
    .loc[:, model_columns]
    .dropna()
)

# First 70% of the rows (in file order) for training, the remainder for testing
train_size = int(len(data) * 0.7)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

# Fit the min-max scaler on the training rows only, then apply it to both splits
scaler = MinMaxScaler().fit(train_data)
train_data_scaled = scaler.transform(train_data)
test_data_scaled = scaler.transform(test_data)

# Build sliding-window samples for sequence models
def create_time_series_dataset(X, y, lookback):
    """Turn aligned sequences into (window, next-value) training pairs.

    For each start position ``s`` in ``range(len(X) - lookback)`` the sample is
    ``X[s:s + lookback]`` and its target is ``y[s + lookback]``.

    Parameters
    ----------
    X : sliceable sequence of feature rows.
    y : indexable sequence of targets, aligned with ``X``.
    lookback : number of consecutive rows in each window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray) : the stacked windows and their targets.
    Both are empty when ``len(X) <= lookback``.
    """
    n_windows = len(X) - lookback
    windows = [X[start:start + lookback] for start in range(n_windows)]
    targets = [y[start + lookback] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

# Seed NumPy and TensorFlow so weight initialisation and batch shuffling are
# repeatable between runs.
# NOTE(review): some GPU ops can still be non-deterministic.
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

# Define hyperparameters and create time series datasets
lookback = 60
batch_size = 32
epochs = 10

# The target is the last scaled column ('Close'); each sample is the previous
# `lookback` rows of all columns
train_X, train_y = create_time_series_dataset(train_data_scaled, train_data_scaled[:, -1], lookback)
test_X, test_y = create_time_series_dataset(test_data_scaled, test_data_scaled[:, -1], lookback)

# Define LSTM model architecture
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(64, return_sequences=True, input_shape=(lookback, train_X.shape[-1])),
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(1)
])

# Compile and fit model.
# NOTE(review): the test split doubles as validation data here; it is only
# monitored, not used for early stopping or model selection.
model.compile(optimizer='adam', loss='mse')
model.fit(train_X, train_y, epochs=epochs, batch_size=batch_size, validation_data=(test_X, test_y))

# Make predictions on test set and scale back to original values.
# The scaler was fit on all columns, so the scaled prediction is placed in the
# last ('Close') column next to the feature columns of each window's final row
# before inverting, and only that last column is kept.
predictions = model.predict(test_X)
predictions = scaler.inverse_transform(np.concatenate((test_X[:, -1, :-1], predictions), axis=1))[:, -1]

# Actual closes aligned with the predictions: the first `lookback` test rows
# only serve as history. `.iloc` makes the positional slice explicit.
actual_close = test_data['Close'].iloc[lookback:]

# Calculate error metrics
error = mean_squared_error(actual_close, predictions)
print('Testing Mean Squared Error: %.3f' % error)

error2 = smape_kun(actual_close.values, predictions)
print('Symmetric mean absolute percentage error: %.3f' % error2)


In [None]:
import matplotlib.pyplot as plt

# Overlay predicted and actual closing prices for the test period
fig, ax = plt.subplots(figsize=(10, 6))

# The first `lookback` test rows have no prediction, so both series start after them
plot_dates = test_data.index[lookback:]
ax.plot(plot_dates, test_data['Close'][lookback:], label='Actual')
ax.plot(plot_dates, predictions, label='Predicted')

ax.set_title('Actual vs Predicted Close Prices')
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.legend()
plt.show()
